Repository: clamchowder/Microbenchmarks
Branch: master
Commit: 13159d44086d
Files: 315
Total size: 3.0 MB
Directory structure:
gitextract_nqotrkr3/
├── .github/
│ └── workflows/
│ └── linux.yaml
├── .gitignore
├── AsmGen/
│ ├── AsmGen.csproj
│ ├── AsmGen.sln
│ ├── DataFiles/
│ │ ├── BranchhistTestBlock.c
│ │ ├── CommonFunctions.c
│ │ ├── GccBranchHistFunction.c
│ │ ├── GccIndirectBranchFunction.c
│ │ ├── IndirectBranchTestBlock.c
│ │ └── clammicrobench.vcxproj_template
│ ├── IUarchTest.cs
│ ├── Program.cs
│ ├── Properties/
│ │ └── launchSettings.json
│ ├── README.md
│ ├── UarchTest.cs
│ ├── UarchTestHelpers.cs
│ └── tests/
│ ├── A73RobTest.cs
│ ├── AddLoopTest.cs
│ ├── AddNsq.cs
│ ├── AddSchedTest.cs
│ ├── AddvNsq.cs
│ ├── AddvSchedTest.cs
│ ├── AeseSchedTest.cs
│ ├── AesencNsq.cs
│ ├── BranchBufferTest.cs
│ ├── BranchHistoryTest.cs
│ ├── BtbTest.cs
│ ├── CvtSchedTest.cs
│ ├── FAdd256RfTest.cs
│ ├── Fadd128RfTest.cs
│ ├── Fadd128SchedTest.cs
│ ├── Fadd256SchedTest.cs
│ ├── FaddNsq.cs
│ ├── FaddSchedTest.cs
│ ├── FcmpSchedTest.cs
│ ├── FlagRfTest.cs
│ ├── Fma256SchedTest.cs
│ ├── FmovSched.cs
│ ├── FmulSchedTest.cs
│ ├── FpRfTest.cs
│ ├── FpStoreDataNsq.cs
│ ├── IdrfTest.cs
│ ├── IndirectBranchTest.cs
│ ├── IntRfDepStoreTest.cs
│ ├── IntRfTest.cs
│ ├── JsCvtNsq.cs
│ ├── JsCvtSched.cs
│ ├── JumpNsqTest.cs
│ ├── JumpSchedTest.cs
│ ├── LdqTest.cs
│ ├── LeaSchedTest.cs
│ ├── LoadNsq.cs
│ ├── LoadSchedTest.cs
│ ├── MaddSchedTest.cs
│ ├── MaskRfTest.cs
│ ├── MixAddJumpSched.cs
│ ├── MixAddvJsCvtNsq.cs
│ ├── MixAddvJsCvtSched.cs
│ ├── MixBranchStoreTest.cs
│ ├── MixFAdd256and32RfTest.cs
│ ├── MixFpRfDepBranchTest.cs
│ ├── MixFpVecRfTest.cs
│ ├── MixIntRfDepBranchTest.cs
│ ├── MixIntVec128RfTest.cs
│ ├── MixIntrfFprfTest.cs
│ ├── MixJumpStoreDataSched.cs
│ ├── MixJumpStoreSchedTest.cs
│ ├── MixJumpThenAddSched.cs
│ ├── MixLdqStqTest.cs
│ ├── MixLoadStoreDivSchedTest.cs
│ ├── MixLoadStoreSchedTest.cs
│ ├── MixStoreDivSchedTest.cs
│ ├── MixVec512Vec256BlockRfTest.cs
│ ├── MixVec512Vec256RfTest.cs
│ ├── MmxRfTest.cs
│ ├── MulSchedTest.cs
│ ├── NopLoopTest.cs
│ ├── PdepSchedTest.cs
│ ├── ReturnStackTest.cs
│ ├── RobTest.cs
│ ├── RorSchedTest.cs
│ ├── ShlSchedTest.cs
│ ├── StoreDataDivNsqTest.cs
│ ├── StoreDataNsqTest.cs
│ ├── StoreDataSchedTest.cs
│ ├── StoreDivNsqTest.cs
│ ├── StoreDivSchedTest.cs
│ ├── StoreNsq.cs
│ ├── StoreSchedTest.cs
│ ├── Stq128Test.cs
│ ├── Stq512Test.cs
│ ├── StqTest.cs
│ ├── TakenBranchBufferTest.cs
│ ├── TakenJumpSchedTest.cs
│ ├── Vec512RfTest.cs
│ ├── VecMulNsq.cs
│ └── ZeroRobTest.cs
├── CoherencyLatency/
│ ├── CoherencyLatency.cpp
│ ├── CoherencyLatency.sln
│ ├── CoherencyLatency.vcxproj
│ ├── Makefile
│ ├── PThreadsCoherencyLatency.c
│ └── c2cparse/
│ ├── Program.cs
│ ├── c2cparse.csproj
│ └── c2cparse.sln
├── Common/
│ ├── arch_detect.mk
│ ├── ci_gpumemlatency.sh
│ ├── ci_package.sh
│ ├── perfmon.h
│ ├── timing.c
│ └── timing.h
├── CoreClockChecker/
│ ├── BoostClockChecker.c
│ ├── BoostClockChecker_arm.s
│ ├── BoostClockChecker_x86.s
│ ├── CoreClockChecker.c
│ ├── CoreClockChecker_x86.s
│ ├── Makefile
│ └── WinCoreClockChecker/
│ ├── CoreClockCheckFunctions.asm
│ ├── WinCoreClockChecker.cpp
│ ├── WinCoreClockChecker.sln
│ ├── WinCoreClockChecker.vcxproj
│ └── WinCoreClockChecker.vcxproj.filters
├── GpuMemLatency/
│ ├── Makefile
│ ├── OpenCL/
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── include/
│ │ │ └── CL/
│ │ │ ├── cl.h
│ │ │ ├── cl_d3d10.h
│ │ │ ├── cl_d3d11.h
│ │ │ ├── cl_dx9_media_sharing.h
│ │ │ ├── cl_dx9_media_sharing_intel.h
│ │ │ ├── cl_egl.h
│ │ │ ├── cl_ext.h
│ │ │ ├── cl_ext_intel.h
│ │ │ ├── cl_gl.h
│ │ │ ├── cl_gl_ext.h
│ │ │ ├── cl_half.h
│ │ │ ├── cl_icd.h
│ │ │ ├── cl_platform.h
│ │ │ ├── cl_va_api_media_sharing_intel.h
│ │ │ ├── cl_version.h
│ │ │ └── opencl.h
│ │ └── lib/
│ │ └── OpenCL.lib
│ ├── atomic_test.c
│ ├── bw_test.c
│ ├── common.c
│ ├── instruction_rate.c
│ ├── instruction_rate_fp16_kernel.cl
│ ├── instruction_rate_fp64_kernel.cl
│ ├── instruction_rate_kernel.cl
│ ├── kernel.cl
│ ├── kernels/
│ │ ├── atomic_exec_latency_test.cl
│ │ ├── buffer_bw_test.cl
│ │ ├── c2c_atomic_exec_latency_test.cl
│ │ ├── constant_unrolled_latency_test.cl
│ │ ├── ldst_bw_test.cl
│ │ ├── local_64_bw_test.cl
│ │ ├── local_atomic_latency_test.cl
│ │ ├── local_bw_test.cl
│ │ ├── local_float4_bw_test.cl
│ │ ├── local_unrolled_latency_test.cl
│ │ ├── scalar_unrolled_latency_test.cl
│ │ ├── sum_bw_test.cl
│ │ ├── tex_bw_test.cl
│ │ ├── tex_latency_test.cl
│ │ └── unrolled_latency_test.cl
│ ├── latency_test.c
│ ├── local_mem_latency_kernel.cl
│ ├── opencltest.c
│ ├── opencltest.h
│ ├── opencltest.sln
│ ├── opencltest.vcxproj
│ ├── opencltest.vcxproj.filters
│ └── texturetest.c
├── InstructionRate/
│ ├── Makefile
│ ├── arm_instructionrate.c
│ ├── arm_instructionrate.s
│ ├── riscv_instructionrate.c
│ ├── riscv_instructionrate.s
│ ├── test.s
│ ├── x86_fusion.c
│ ├── x86_fusion.s
│ ├── x86_instructionrate.c
│ └── x86_instructionrate.s
├── LICENSE
├── LoadedMemoryLatency/
│ ├── LoadedMemoryLatency/
│ │ ├── LoadedMemoryLatency.asm
│ │ ├── LoadedMemoryLatency.cpp
│ │ ├── LoadedMemoryLatency.sln
│ │ ├── LoadedMemoryLatency.vcxproj
│ │ └── LoadedMemoryLatency.vcxproj.filters
│ ├── LoadedMemoryLatency.c
│ ├── LoadedMemoryLatency_amd64.s
│ ├── LoadedMemoryLatency_arm.s
│ └── Makefile
├── Makefile
├── MemoryBandwidth/
│ ├── Makefile
│ ├── MemoryBandwidth/
│ │ ├── MemoryBandwidth.cpp
│ │ ├── MemoryBandwidth.sln
│ │ ├── MemoryBandwidth.vcxproj
│ │ ├── MemoryBandwidth.vcxproj.filters
│ │ ├── MemoryBandwidthFunctions.asm
│ │ └── MemoryBandwidthFunctions32.asm
│ ├── MemoryBandwidth.c
│ ├── MemoryBandwidth_arm.s
│ ├── MemoryBandwidth_riscv.s
│ ├── MemoryBandwidth_x86.s
│ ├── MixedMemoryBandwidthTest/
│ │ ├── MemoryBandwidth.h
│ │ ├── MemoryBandwidthFunctions.asm
│ │ ├── MixedMemoryBandwidthTest.cpp
│ │ ├── MixedMemoryBandwidthTest.vcxproj
│ │ └── MixedMemoryBandwidthTest.vcxproj.filters
│ └── README.md
├── MemoryLatency/
│ ├── Makefile
│ ├── MemoryLatency.c
│ ├── MemoryLatency.cpp
│ ├── MemoryLatency.sln
│ ├── MemoryLatency.vcxproj
│ ├── MemoryLatencyFunctions.asm
│ ├── MemoryLatency_arm.s
│ ├── MemoryLatency_i686.s
│ ├── MemoryLatency_riscv.s
│ ├── MemoryLatency_x86.s
│ └── README.md
├── README.md
├── mt_instructionrate/
│ ├── InstructionRateFunctions.asm
│ ├── Makefile
│ ├── Project1.vcxproj
│ ├── Project1.vcxproj.filters
│ ├── arm_mt_instructionrate.c
│ ├── arm_mt_instructionrate.s
│ ├── mt_instructionrate.c
│ ├── mt_instructionrate.sln
│ ├── ppc64_mt_instructionrate.c
│ ├── ppc64_mt_instructionrate.s
│ ├── x86_mt_instructionrate
│ ├── x86_mt_instructionrate.c
│ └── x86_mt_instructionrate.s
└── svm/
├── OpenCL/
│ ├── include/
│ │ └── CL/
│ │ ├── Utils/
│ │ │ ├── Context.h
│ │ │ ├── Context.hpp
│ │ │ ├── Detail.hpp
│ │ │ ├── Device.hpp
│ │ │ ├── Error.h
│ │ │ ├── Error.hpp
│ │ │ ├── ErrorCodes.h
│ │ │ ├── Event.h
│ │ │ ├── Event.hpp
│ │ │ ├── File.h
│ │ │ ├── File.hpp
│ │ │ ├── InteropContext.hpp
│ │ │ ├── OpenCLUtilsCpp_Export.h
│ │ │ ├── OpenCLUtils_Export.h
│ │ │ ├── Platform.hpp
│ │ │ ├── Utils.h
│ │ │ └── Utils.hpp
│ │ ├── cl.h
│ │ ├── cl2.hpp
│ │ ├── cl_d3d10.h
│ │ ├── cl_d3d11.h
│ │ ├── cl_dx9_media_sharing.h
│ │ ├── cl_dx9_media_sharing_intel.h
│ │ ├── cl_egl.h
│ │ ├── cl_ext.h
│ │ ├── cl_ext_intel.h
│ │ ├── cl_function_types.h
│ │ ├── cl_gl.h
│ │ ├── cl_gl_ext.h
│ │ ├── cl_half.h
│ │ ├── cl_icd.h
│ │ ├── cl_layer.h
│ │ ├── cl_platform.h
│ │ ├── cl_va_api_media_sharing_intel.h
│ │ ├── cl_version.h
│ │ ├── opencl.h
│ │ └── opencl.hpp
│ ├── lib/
│ │ ├── OpenCL.lib
│ │ ├── OpenCLExt.lib
│ │ ├── OpenCLUtils.lib
│ │ ├── OpenCLUtilsCpp.lib
│ │ ├── OpenCLUtilsCppd.lib
│ │ ├── OpenCLUtilsd.lib
│ │ └── pkgconfig/
│ │ └── OpenCL.pc
│ └── share/
│ ├── cmake/
│ │ ├── OpenCL/
│ │ │ ├── OpenCLConfig.cmake
│ │ │ └── OpenCLConfigVersion.cmake
│ │ ├── OpenCLExtensionLoader/
│ │ │ ├── OpenCLExtensionLoaderConfig.cmake
│ │ │ ├── OpenCLExtensionLoaderConfigVersion.cmake
│ │ │ ├── OpenCLExtensionLoaderTargets-debug.cmake
│ │ │ ├── OpenCLExtensionLoaderTargets-release.cmake
│ │ │ └── OpenCLExtensionLoaderTargets.cmake
│ │ ├── OpenCLHeaders/
│ │ │ ├── OpenCLHeadersConfig.cmake
│ │ │ ├── OpenCLHeadersConfigVersion.cmake
│ │ │ └── OpenCLHeadersTargets.cmake
│ │ ├── OpenCLHeadersCpp/
│ │ │ ├── OpenCLHeadersCppConfig.cmake
│ │ │ ├── OpenCLHeadersCppConfigVersion.cmake
│ │ │ └── OpenCLHeadersCppTargets.cmake
│ │ ├── OpenCLICDLoader/
│ │ │ ├── OpenCLICDLoaderConfig.cmake
│ │ │ ├── OpenCLICDLoaderConfigVersion.cmake
│ │ │ ├── OpenCLICDLoaderTargets-debug.cmake
│ │ │ ├── OpenCLICDLoaderTargets-release.cmake
│ │ │ └── OpenCLICDLoaderTargets.cmake
│ │ ├── OpenCLUtils/
│ │ │ ├── OpenCLUtilsConfig.cmake
│ │ │ ├── OpenCLUtilsConfigVersion.cmake
│ │ │ ├── OpenCLUtilsTargets-debug.cmake
│ │ │ ├── OpenCLUtilsTargets-release.cmake
│ │ │ └── OpenCLUtilsTargets.cmake
│ │ └── OpenCLUtilsCpp/
│ │ ├── OpenCLUtilsCppConfig.cmake
│ │ ├── OpenCLUtilsCppConfigVersion.cmake
│ │ ├── OpenCLUtilsCppTargets-debug.cmake
│ │ ├── OpenCLUtilsCppTargets-release.cmake
│ │ └── OpenCLUtilsCppTargets.cmake
│ └── pkgconfig/
│ ├── OpenCL-CLHPP.pc
│ └── OpenCL-Headers.pc
├── atomic_latency_kernel.cl
├── svm.sln
├── svm.vcxproj
├── svm.vcxproj.filters
└── svmtest.cpp
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/linux.yaml
================================================
name: Build Benchmarks on Ubuntu
on: [push]
jobs:
  BuildBenchmarks:
    # Only Ubuntu for now.
    runs-on: ubuntu-latest
    steps:
      - name: Install prerequisites
        run: sudo apt update && sudo apt -qq --assume-yes full-upgrade && sudo apt install -qq -y build-essential crossbuild-essential-arm64 gcc-riscv64-linux-gnu ocl-icd-opencl-dev opencl-headers libnuma-dev b3sum unzip
      - name: Wild tomfoolery attempt
        run: eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)" && brew install mingw-w64
      - name: Check out repository code
        uses: actions/checkout@v3
      - name: Build all benchmarks
        run: eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)" && make ci
      - name: Package benchmarks
        run: make package
      - name: b3sum
        run: b3sum clammarks.txz
      # - name: Upload package
      #   env:
      #     UPLOAD_KEY: ${{ secrets.UPLOAD_KEY }}
      #     UPLOAD_URL: ${{ secrets.UPLOAD_URL }}
      #   run: curl -X PUT -T clammarks.txz -H "$UPLOAD_KEY" "$UPLOAD_URL"
================================================
FILE: .gitignore
================================================
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates
*.swp
*generatednasm*
*.exe
MemoryBandwidth/membw_*
MemoryLatency/MemoryLatency
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Mono auto generated files
mono_crash.*
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/
clammicrobench/*.asm
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# ASP.NET Scaffolding
ScaffoldingReadMe.txt
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.log
*.tlog
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Nuget personal access tokens and Credentials
nuget.config
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# CodeRush personal settings
.cr/personal
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
# BeatPulse healthcheck temp database
healthchecksdb
# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/
# Ionide (cross platform F# VS Code tools) working folder
.ionide/
# Fody - auto-generated XML schema
FodyWeavers.xsd
# VS Code files for those working on multiple tools
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace
# Local History for Visual Studio Code
.history/
# Windows Installer files from build outputs
*.cab
*.msi
*.msix
*.msm
*.msp
# JetBrains Rider
.idea/
*.sln.iml
================================================
FILE: AsmGen/AsmGen.csproj
================================================
Exe
net8.0
false
x64
AnyCPU;x64
Always
Always
Always
Always
Always
Always
================================================
FILE: AsmGen/AsmGen.sln
================================================
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.2.32516.85
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AsmGen", "AsmGen.csproj", "{B8930E86-946C-4831-B088-F571E73EEDC4}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Debug|x64 = Debug|x64
Release|Any CPU = Release|Any CPU
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|Any CPU.Build.0 = Debug|Any CPU
{B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|x64.ActiveCfg = Debug|x64
{B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|x64.Build.0 = Debug|x64
{B8930E86-946C-4831-B088-F571E73EEDC4}.Release|Any CPU.ActiveCfg = Release|Any CPU
{B8930E86-946C-4831-B088-F571E73EEDC4}.Release|Any CPU.Build.0 = Release|Any CPU
{B8930E86-946C-4831-B088-F571E73EEDC4}.Release|x64.ActiveCfg = Release|x64
{B8930E86-946C-4831-B088-F571E73EEDC4}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {4433D029-CD62-44B9-862E-A8DE52DA45CE}
EndGlobalSection
EndGlobal
================================================
FILE: AsmGen/DataFiles/BranchhistTestBlock.c
================================================
// Partial fragment appended into the generated benchmark's main() by AsmGen.
// Generated code preceding this block provides branchHistoryLengths,
// branchCounts, maxBranchCount, and initializeBranchHistFuncArr.
uint32_t testSizeCount = sizeof(branchHistoryLengths) / sizeof(int);
initializeBranchHistFuncArr();
srand(time(NULL));
// result matrices: rows = branch counts, columns = history lengths
size_t resultSize = sizeof(float) * maxBranchCount * testSizeCount;
float* randomResults = (float*)malloc(resultSize);
float* predictableResults = (float*)malloc(resultSize);
for (uint32_t branchCountIdx = 0; branchCountIdx < maxBranchCount; branchCountIdx++) {
for (uint32_t testSizeIdx = 0; testSizeIdx < testSizeCount; testSizeIdx++) {
uint32_t testSize = branchHistoryLengths[testSizeIdx];
uint32_t branchCount = branchCounts[branchCountIdx];
printf("Testing branch count %d history length %d\n", branchCount, testSize);
// third arg per runBranchHistTest: 1 = random pattern, 0 = all zeroes (predictable)
randomResults[branchCountIdx * testSizeCount + testSizeIdx] = runBranchHistTest(testSize, branchCountIdx, 1);
predictableResults[branchCountIdx * testSizeCount + testSizeIdx] = runBranchHistTest(testSize, branchCountIdx, 0);
printf("%d, %f, %f\n", testSize,
randomResults[branchCountIdx * testSizeCount + testSizeIdx],
predictableResults[branchCountIdx * testSizeCount + testSizeIdx]);
}
}
// dump both matrices in CSV form for pasting into a spreadsheet
printf("Random:\n");
printResultFloatArr(randomResults, branchHistoryLengths, testSizeCount, branchCounts, maxBranchCount);
printf("\nPredictable:\n");
printResultFloatArr(predictableResults, branchHistoryLengths, testSizeCount, branchCounts, maxBranchCount);
free(randomResults);
free(predictableResults);
================================================
FILE: AsmGen/DataFiles/CommonFunctions.c
================================================
// this is a partial C file that's appended into generated code
// stuff here is generic enough to work for both windows/vs and gcc
#ifndef __MINGW32__
// optional affinity setting for effed up qualcomm/android bs
// NOTE(review): the header names after #include were lost in extraction; the
// set below is reconstructed (cpu_set_t/CPU_ZERO/CPU_SET need sched.h with
// _GNU_SOURCE, printf needs stdio.h, gettid needs unistd.h + sys/syscall.h)
// — TODO confirm against the original file.
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
// Request pinning the calling thread to `core`. The sched_setaffinity call is
// commented out in the original, so currently this only logs the request.
void setAffinity(int core) {
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(core, &cpuset);
printf("Set affinity to core %d\n", core);
// sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
}
#endif
// Per-thread parameters handed to generated benchmark thread entry points.
// Field roles are inferred from names — exact usage is defined by the
// generated asm/C code (NOTE(review): confirm against generator output).
struct ThreadData {
int* A; // integer array used by the test (presumably pointer-chasing/input data)
int* B; // second integer array
float* fpArr; // floating point data for FP tests
uint32_t list_size; // element count of the arrays above
uint64_t structIterations; // iteration count for the test loop
};
// Emit the CSV header row: a leading "x" cell followed by one column label
// per entry of xCounts, then a newline.
void printCsvHeader(uint32_t* xCounts, uint32_t xLen) {
    printf("x");
    uint32_t col = 0;
    while (col < xLen) {
        printf(", %d", xCounts[col]);
        col++;
    }
    printf("\n");
}
// Print a yLen-by-xLen matrix of float results as CSV that a spreadsheet can
// ingest: header row from printCsvHeader, then one row per yCounts entry.
void printResultFloatArr(float* arr, uint32_t *xCounts, uint32_t xLen, uint32_t *yCounts, uint32_t yLen) {
    printCsvHeader(xCounts, xLen);
    for (uint32_t row = 0; row < yLen; row++) {
        // row header cell
        printf("%d", yCounts[row]);
        float *rowData = arr + row * xLen;
        for (uint32_t col = 0; col < xLen; col++) {
            printf(",%f", rowData[col]);
        }
        printf("\n");
    }
}
// Build a pointer-chasing pattern in pattern_arr: elements spaced
// byte_increment bytes apart are first set to their own element offsets, then
// shuffled so chasing arr[idx] visits the strided elements in random order.
// list_size is in uint32_t elements; byte_increment is the stride in bytes.
void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) {
uint32_t increment = byte_increment / sizeof(uint32_t);
uint32_t element_count = list_size / increment;
// identity mapping at stride `increment`
for (int i = 0; i < element_count; i++) {
pattern_arr[i * increment] = i * increment;
}
// Shuffle (Fisher-Yates shape). Note j is drawn from [0, iter-2], so the
// element at `iter` is never swapped with itself; that keeps every element
// off its own slot as in Sattolo's algorithm, at the cost of a slightly
// biased permutation. NOTE(review): presumably intentional so the pointer
// chase forms one long cycle instead of short-circuiting — confirm.
int iter = element_count;
while (iter > 1) {
iter -= 1;
int j = iter - 1 == 0 ? 0 : rand() % (iter - 1);
uint32_t tmp = pattern_arr[iter * increment];
pattern_arr[iter * increment] = pattern_arr[j * increment];
pattern_arr[j * increment] = tmp;
}
}
================================================
FILE: AsmGen/DataFiles/GccBranchHistFunction.c
================================================
// this is a partial C file that's appended into generated code
// Run a test, return the result in time (ns) per branch.
// historyLen: length of the random pattern array each branch loops through
// branchCountIdx: index into branchCounts[], selects the generated test function
// random: if 1, randomize test array contents. If 0, fill with zeroes
// Requires generated symbols: branchCounts, branchtestFuncArr.
float runBranchHistTest(uint32_t historyLen, uint32_t branchCountIdx, int random) {
struct timeval startTv, endTv;
struct timezone startTz, endTz;
uint32_t branchCount = branchCounts[branchCountIdx];
// scale iterations down as branch count rises to keep total work roughly constant
uint64_t iterations = 320000000 / branchCount;
uint64_t(*branchtestFunc)(uint64_t, uint32_t**, uint32_t) __attribute((sysv_abi)) = branchtestFuncArr[branchCountIdx];
float onesCount = 0.0f;
// one taken/not-taken pattern array per branch
uint32_t** testArrToArr = (uint32_t**)malloc(sizeof(uint32_t*) * branchCount);
for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) {
uint32_t* testArr = (uint32_t*)malloc(sizeof(uint32_t) * historyLen);
for (uint32_t i = 0; i < historyLen; i++) {
testArr[i] = random ? rand() % 2 : 0;
if (testArr[i] > 0)
{
onesCount += 1.0f;
}
}
testArrToArr[testArrIdx] = testArr;
}
// bug fix: scale to percent; previously printed a 0..1 fraction while the
// message (and the matching fprintf below) says "percent"
fprintf(stderr, "Starting test, should have %0.2f percent ones\n", 100.0f * onesCount / ((float)historyLen * branchCount));
gettimeofday(&startTv, &startTz);
uint64_t takenBranchCount = branchtestFunc(iterations, testArrToArr, historyLen);
gettimeofday(&endTv, &endTz);
// millisecond resolution is adequate for these iteration counts
uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
float latency = 1e6 * (float)time_diff_ms / (float)iterations;
// give result in latency per branch
latency = latency / branchCount;
fprintf(stderr, "History length %u, branch count %u: %0.2f percent not-taken\n", historyLen, branchCount, 100 * (float)takenBranchCount / ((float)iterations * branchCount));
for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) free(testArrToArr[testArrIdx]);
free(testArrToArr);
return latency;
}
================================================
FILE: AsmGen/DataFiles/GccIndirectBranchFunction.c
================================================
// similar but for indirect branch test; returns time (ns) per branch.
// needs generated symbols: indirectBranchTestFuncArr, indirectBranchCounts,
// indirectBranchTargetCounts
// mode:
// 0 - cycle through targets
// 1 - random target selection
// 2 - jump to middle
float runIndirectBranchTest(uint32_t branchCountIdx, uint32_t targetCountIdx, uint32_t mode) {
struct timeval startTv, endTv;
struct timezone startTz, endTz;
uint32_t branchCount = indirectBranchCounts[branchCountIdx];
uint32_t targetCount = indirectBranchTargetCounts[targetCountIdx];
// scale iterations down as branch count rises to keep total runtime bounded
uint64_t iterations = 80000000 / branchCount;
uint64_t(*branchtestFunc)(uint64_t, uint32_t**, uint32_t, uint64_t **) __attribute((sysv_abi)) = indirectBranchTestFuncArr[branchCountIdx][targetCountIdx];
// generate an array containing jump target indexes for every branch
uint32_t** testArrToArr = (uint32_t**)malloc(sizeof(uint32_t*) * branchCount);
for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) {
uint32_t* testArr = (uint32_t*)malloc(sizeof(uint32_t) * targetCount);
if (mode == 1)
for (uint32_t i = 0; i < targetCount; i++) testArr[i] = rand() % targetCount;
else if (mode == 0)
for (uint32_t i = 0; i < targetCount; i++) testArr[i] = i;
else if (mode == 2)
for (uint32_t i = 0; i < targetCount; i++) testArr[i] = targetCount / 2;
testArrToArr[testArrIdx] = testArr;
}
// each branch needs a jump table; allocated uninitialized here — presumably
// populated by the generated test function before use (TODO confirm)
uint64_t** jumpTables = (uint64_t**)malloc(sizeof(uint64_t*) * branchCount);
for (int jumpTableIdx = 0; jumpTableIdx < branchCount; jumpTableIdx++)
{
uint64_t* jumpTable = (uint64_t*)malloc(sizeof(uint64_t) * targetCount);
jumpTables[jumpTableIdx] = jumpTable;
}
gettimeofday(&startTv, &startTz);
// uint64_t iterations, uint32_t **arr, uint32_t arrLen, uint64_t **scratch
branchtestFunc(iterations, testArrToArr, targetCount, jumpTables);
gettimeofday(&endTv, &endTz);
// millisecond resolution is adequate for these iteration counts
uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
float latency = 1e6 * (float)time_diff_ms / (float)iterations;
// give result in latency per branch
latency = latency / branchCount;
for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) free(testArrToArr[testArrIdx]);
free(testArrToArr);
for (int jumpTableIdx = 0; jumpTableIdx < branchCount; jumpTableIdx++) free(jumpTables[jumpTableIdx]);
free(jumpTables);
return latency;
}
================================================
FILE: AsmGen/DataFiles/IndirectBranchTestBlock.c
================================================
// Partial fragment appended into the generated benchmark's main() by AsmGen.
// Generated code preceding this block provides:
// - indirectBranchTargetCounts = array containing # of targets per branch
// - indirectBranchCounts = array containing # of branches to test
// - maxIndirectBranchCount = length of ^^
// - initializeIndirectBranchFuncArr = populates the per-test function table
uint32_t testSizeCount = sizeof(indirectBranchTargetCounts) / sizeof(int);
initializeIndirectBranchFuncArr();
srand(time(NULL));
// result matrices: rows = branch counts, columns = target counts
size_t resultSize = sizeof(float) * maxIndirectBranchCount * testSizeCount;
float* results = (float*)malloc(resultSize);
float* refResults = (float*)malloc(resultSize);
for (uint32_t branchCountIdx = 0; branchCountIdx < maxIndirectBranchCount; branchCountIdx++) {
for (uint32_t targetCountIdx = 0; targetCountIdx < testSizeCount; targetCountIdx++) {
uint32_t testSize = indirectBranchTargetCounts[targetCountIdx];
uint32_t branchCount = indirectBranchCounts[branchCountIdx];
printf("Testing branch count %d target count %d:", branchCount, testSize);
// mode 0 = cycle through targets; mode 2 = always jump to middle (reference)
results[branchCountIdx * testSizeCount + targetCountIdx] = runIndirectBranchTest(branchCountIdx, targetCountIdx, 0);
refResults[branchCountIdx * testSizeCount + targetCountIdx] = runIndirectBranchTest(branchCountIdx, targetCountIdx, 2);
printf("%f ns, reference %f ns\n",
results[branchCountIdx * testSizeCount + targetCountIdx],
refResults[branchCountIdx * testSizeCount + targetCountIdx]);
}
}
// dump both matrices in CSV form for pasting into a spreadsheet
printf("Indirect branch results:\n");
printResultFloatArr(results, indirectBranchTargetCounts, testSizeCount, indirectBranchCounts, maxIndirectBranchCount);
printf("Reference indirect branch results:\n");
printResultFloatArr(refResults, indirectBranchTargetCounts, testSizeCount, indirectBranchCounts, maxIndirectBranchCount);
free(results);
free(refResults);
================================================
FILE: AsmGen/DataFiles/clammicrobench.vcxproj_template
================================================
Debug
Win32
Release
Win32
Debug
x64
Release
x64
16.0
Win32Proj
{7e8cf2ba-57a7-4b42-b721-97e02bf9a8b8}
clammicrobench
10.0
Application
true
v142
Unicode
Application
false
v142
true
Unicode
Application
true
v142
Unicode
Application
false
v142
true
Unicode
true
false
true
false
Level3
true
WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
Level3
true
_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
%REPLACEWITHCUSTOMBUILD%
================================================
FILE: AsmGen/IUarchTest.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// Contract for a microarchitecture test that AsmGen can emit code for:
/// per-ISA assembly plus the C driver block that calls it.
/// </summary>
public interface IUarchTest
{
/// Name prefix for generated thread-launch wrapper functions.
public const string ThreadLaunchFunctionPrefix = "ThreadLaunch_";
// enough to generate global lines, function calls, and let user pick from tests
/// <summary>Unique short name used to prefix generated symbols and select the test.</summary>
public string Prefix { get; }
/// <summary>Human-readable description of the test.</summary>
public string Description { get; }
/// <summary>NOTE(review): presumably tells the harness to divide measured time by the tested count — confirm in the generator.</summary>
public bool DivideTimeByCount { get; }
/// <summary>Whether this test can generate code for the given ISA.</summary>
public bool SupportsIsa(ISA isa);
/// <summary>Append this test's assembly implementation for the given ISA.</summary>
public void GenerateAsm(StringBuilder sb, ISA isa);
/// <summary>Append the C code that runs this test into the generated driver.</summary>
public void GenerateTestBlock(StringBuilder sb, ISA isa);
/// <summary>Append global symbol directives for this test's asm functions.</summary>
public void GenerateAsmGlobalLines(StringBuilder sb);
/// <summary>Append C extern declarations for this test's asm functions.</summary>
public void GenerateExternLines(StringBuilder sb);
/// <summary>Instruction set architectures AsmGen can target.</summary>
public enum ISA
{
amd64, // 64-bit x86
aarch64, // 64-bit arm
mips64, // 64-bit MIPS, for loongson
riscv, // 64-bit risc-v
}
}
}
================================================
FILE: AsmGen/Program.cs
================================================
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Threading.Tasks;
namespace AsmGen
{
class Program
{
public static string DataFilesDir = "DataFiles";
static int structTestIterations = 5000000;
static int iterations = 100 * structTestIterations;
static int latencyListSize = 131072 * 1024 / 4; // 128 MB
// Entry point: registers the microarchitecture tests to generate, then emits
// the per-ISA C driver and asm files in parallel, followed by a Makefile.
static void Main(string[] args)
{
    // NOTE(review): generic type arguments restored — the extracted text had
    // bare "List", which does not compile; element types follow from usage.
    List<IUarchTest> tests = new List<IUarchTest>();
    tests.Add(new BtbTest(4, BtbTest.BranchType.Unconditional));
    tests.Add(new BtbTest(8, BtbTest.BranchType.Unconditional));
    tests.Add(new BtbTest(16, BtbTest.BranchType.Unconditional));
    tests.Add(new BtbTest(32, BtbTest.BranchType.Unconditional));
    tests.Add(new BtbTest(64, BtbTest.BranchType.Unconditional));
    tests.Add(new BtbTest(4, BtbTest.BranchType.Conditional));
    tests.Add(new BtbTest(8, BtbTest.BranchType.Conditional));
    tests.Add(new BtbTest(16, BtbTest.BranchType.Conditional));
    tests.Add(new BtbTest(32, BtbTest.BranchType.Conditional));
    tests.Add(new BranchHistoryTest());

    // Each task writes its own output file, so they can run concurrently
    // without sharing mutable state.
    List<Task> tasks = new List<Task>();
    tasks.Add(Task.Run(() => GenerateCFile(tests, IUarchTest.ISA.amd64)));
    tasks.Add(Task.Run(() => GenerateCFile(tests, IUarchTest.ISA.aarch64)));
    tasks.Add(Task.Run(() => GenerateCFile(tests, IUarchTest.ISA.mips64)));
    tasks.Add(Task.Run(() => GenerateCFile(tests, IUarchTest.ISA.riscv)));
    tasks.Add(Task.Run(() => GenerateAsmFile(tests, IUarchTest.ISA.amd64)));
    tasks.Add(Task.Run(() => GenerateAsmFile(tests, IUarchTest.ISA.aarch64)));
    tasks.Add(Task.Run(() => GenerateAsmFile(tests, IUarchTest.ISA.mips64)));
    tasks.Add(Task.Run(() => GenerateAsmFile(tests, IUarchTest.ISA.riscv)));
    Task.WaitAll(tasks.ToArray());
    GenerateMakefile();
}
static void GenerateCFile(List tests, IUarchTest.ISA isa)
{
StringBuilder sb = new StringBuilder();
sb.AppendLine("#define _GNU_SOURCE");
sb.AppendLine("#include \n#include\n#include\n#include \n#include \n#include \n");
sb.AppendLine("#pragma GCC diagnostic ignored \"-Wattributes\"");
string commonFunctions = File.ReadAllText(Path.Combine(DataFilesDir, "CommonFunctions.c"));
sb.AppendLine(commonFunctions);
foreach (IUarchTest test in tests)
{
if (test.SupportsIsa(isa))
{
test.GenerateExternLines(sb);
Console.WriteLine("Test " + test.Prefix + " supports ISA " + isa);
}
}
// no indexed addressing mode on these architectures, so make sure we can do pointer
// chasing with a single instruction
if (isa == IUarchTest.ISA.mips64 || isa == IUarchTest.ISA.riscv)
{
sb.AppendLine("extern void preplatencyarr(int *arr, uint32_t list_size);");
}
AddCommonInitCode(sb, tests, isa);
foreach (IUarchTest test in tests)
{
if (test.SupportsIsa(isa)) test.GenerateTestBlock(sb, isa);
}
AddCommonEndCode(sb);
File.WriteAllText("clammicrobench_" + isa.ToString() + ".c", sb.ToString());
}
static void GenerateAsmFile(List tests, IUarchTest.ISA isa)
{
string filename = "clammicrobench_" + isa.ToString() + ".s";
StringBuilder sb = new StringBuilder();
sb.AppendLine(".text");
if (isa == IUarchTest.ISA.mips64)
{
UarchTest.GenerateMipsPrepArrayFunction(sb);
}
else if (isa == IUarchTest.ISA.riscv)
{
UarchTest.GenerateRiscvPrepArrayFunction(sb);
}
File.WriteAllText(filename, sb.ToString());
sb.Clear();
foreach (IUarchTest test in tests)
{
if (test.SupportsIsa(isa))
{
sb.Clear();
test.GenerateAsmGlobalLines(sb);
test.GenerateAsm(sb, isa);
File.AppendAllText(filename, sb.ToString());
}
}
}
static void GenerateMakefile()
{
StringBuilder sb = new StringBuilder();
foreach (IUarchTest.ISA isa in Enum.GetValues(typeof(IUarchTest.ISA)))
{
sb.AppendLine(isa.ToString() + ":");
if (isa == IUarchTest.ISA.aarch64)
{
sb.AppendLine($"\tgcc -march=armv8.5-a+aes clammicrobench_{isa.ToString()}.c clammicrobench_{isa.ToString()}.s -o cb -static");
// hack for stupid compilers that need a ton of flags to do basic things
sb.AppendLine("android:");
sb.AppendLine("\tclang -march=armv8.3-a -mfpu=neon-fp-armv8 clammicrobench_aarch64.c clammicrobench_aarch64.s -o cb");
}
else sb.AppendLine($"\tgcc -pthread clammicrobench_{isa.ToString()}.c clammicrobench_{isa.ToString()}.s -o cb");
}
sb.AppendLine("win64:");
sb.AppendLine($"\tx86_64-w64-mingw32-gcc clammicrobench_{IUarchTest.ISA.amd64.ToString()}.c clammicrobench_{IUarchTest.ISA.amd64.ToString()}.s -o cb.exe");
sb.AppendLine("clean:");
sb.AppendLine("\trm clammicrobench_* cb");
File.WriteAllText("Makefile", sb.ToString());
}
// Adds largely ISA independent initialization code that gives tests a basic foundation,
// like a pointer chasing array
static void AddCommonInitCode(StringBuilder sb, List tests, IUarchTest.ISA isa)
{
sb.AppendLine("int main(int argc, char *argv[]) {");
sb.AppendLine($" uint64_t time_diff_ms, iterations = {iterations}, structIterations = {structTestIterations}, tmp;");
sb.AppendLine(" double latency; int *A = NULL, *B = NULL; float *fpArr = NULL; char *test_name = NULL; int core_affinity = -1; int threads = 1;");
sb.AppendLine(" uint64_t tmpsink;");
sb.AppendLine(" uint32_t list_size = " + latencyListSize + ";");
// print a help message based on tests available
sb.AppendLine($" printf(\"Usage: -test [test name] -listsize [latency list size = {latencyListSize}] -iterations [struct iterations = {structTestIterations}]\\n\");");
sb.AppendLine(" if (argc < 2) {");
sb.AppendLine(" printf(\"List of tests:\\n\");");
foreach (IUarchTest test in tests)
{
if (test.SupportsIsa(isa)) sb.AppendLine($" printf(\" {test.Prefix} - {test.Description}\\n\");");
}
// args provided. parse them and run test
sb.AppendLine(" } else {");
// args handling
sb.AppendLine(" for (int argIdx = 1; argIdx < argc; argIdx++) {");
sb.AppendLine(" if (*(argv[argIdx]) == '-') { char *arg = argv[argIdx] + 1;");
sb.AppendLine(" if (strncmp(arg, \"test\", 4) == 0) { argIdx++; test_name = argv[argIdx]; }");
sb.AppendLine(" if (strncmp(arg, \"iterations\", 10) == 0) { argIdx++; iterations = 100 * atoi(argv[argIdx]); }");
sb.AppendLine(" if (strncmp(arg, \"listsize\", 8) == 0) { argIdx++; list_size = atoi(argv[argIdx]); }");
sb.AppendLine(" if (strncmp(arg, \"affinity\", 8) == 0) { argIdx++; core_affinity = atoi(argv[argIdx]); }");
sb.AppendLine(" if (strncmp(arg, \"threads\", 7) == 0) { argIdx++; threads = atoi(argv[argIdx]); }");
sb.AppendLine(" }"); // end -arg handling if
sb.AppendLine(" }"); // end args handling for loop
sb.AppendLine(" if (test_name == NULL) { fprintf(stderr, \"No test specified\\n\"); return 0; }");
// Optional affinity setting for certain troublesome platforms
// don't need a version that uses Windows affinity APIs because Windows platforms never have this issue
sb.AppendLine("#ifndef __MINGW32__");
sb.AppendLine(" if (core_affinity != -1) setAffinity(core_affinity);");
sb.AppendLine("#endif");
// Generate array for pointer chasing unless we're doing a BTB test
sb.AppendLine(" if (argc == 1 || argc > 1 && strncmp(test_name, \"btb\", 3) != 0) {");
GenerateLatencyTestArray(sb);
sb.AppendLine(" }"); // end of ptr chasing array generation
sb.AppendLine(" struct timeval startTv, endTv;");
sb.AppendLine(" struct timezone startTz, endTz;");
}
static void AddCommonEndCode(StringBuilder sb)
{
sb.AppendLine(" free(A); free(B); free(fpArr);");
sb.AppendLine(" }"); // end else
sb.AppendLine(" return 0; }");
}
static void GenerateLatencyTestArray(StringBuilder sb)
{
// Fill list to create random access pattern
sb.AppendLine(" A = (int*)malloc(sizeof(int) * list_size);");
sb.AppendLine(" srand(time(NULL));");
sb.AppendLine(" FillPatternArr(A, list_size, 64);\n");
sb.AppendLine("#ifdef _WIN32");
sb.AppendLine(" B = (int*)_aligned_malloc(sizeof(int) * list_size, 64);\n");
sb.AppendLine("#else");
sb.AppendLine(" posix_memalign((void **)&B, 64, sizeof(int) * list_size);\n");
sb.AppendLine("#endif");
sb.AppendLine(" for (int i = 0; i < list_size; i++) { B[i] = i; }\n");
sb.AppendLine("#ifdef _WIN32");
sb.AppendLine(" fpArr = (float*)_aligned_malloc(sizeof(float) * list_size, 64);\n");
sb.AppendLine("#else");
sb.AppendLine(" posix_memalign((void **)&fpArr, 64, sizeof(float) * list_size);");
sb.AppendLine("#endif");
sb.AppendLine(" for (int i = 0;i < list_size; i++) { fpArr[i] = i + .1; }\n");
}
}
}
================================================
FILE: AsmGen/Properties/launchSettings.json
================================================
{
"profiles": {
"AsmGen": {
"commandName": "Project",
"commandLineArgs": "autocopy"
}
}
}
================================================
FILE: AsmGen/README.md
================================================
# Microbenchmark Generator
C# project to generate C and assembly for CPU structure size benchmarks that use different code for each data point, making them
impractical to write by hand. For more details on methodology for out-of-order structure size measurement, see https://blog.stuffedcow.net/2013/05/measuring-rob-capacity/
First, go to Program.cs and set the expected sizes for the structures you want to measure. The constructor for each test generally has the same (low, high, step) format. For example, if you anticipate ROB capacity will be between 128 and 512 entries, you can do `tests.Add(new RobTest(128, 512, 1))`
# Building
Compile the project and run AsmGen.exe. That generates a C harness and assembly file per ISA (e.g. `clammicrobench_amd64.c` and `clammicrobench_amd64.s`) plus a Makefile with one target per ISA. Compilation for Linux:
`gcc clammicrobench_amd64.c clammicrobench_amd64.s -o clammicrobench` for x86_64
`gcc clammicrobench_aarch64.c clammicrobench_aarch64.s -o clammicrobench` for aarch64
`aarch64-linux-gnu-gcc clammicrobench_aarch64.c clammicrobench_aarch64.s -o clammicrobench` to cross compile for aarch64 (for example from a fast desktop)
For Windows, run `AsmGen.exe autocopy`. That copies generated files to the /clammicrobench directory, assuming it's run from the default VS output location. Then, open /clammicrobench/clammicrobench.sln and build. You need nasm in your path for that, as covered on README.md at repo root.
The indirect branch test can take a while to build with nasm, so you might want to reduce the branch and target counts for that. Or just keep it commented out.
# Running
Generally, the syntax is `clammicrobench -test [test name] -listsize [list size for latency test] -iterations [iteration count]`. The last two parameters are optional.
# Tests
Running the program without parameters will spit out a list of tests and brief descriptions. Most are structure size tests. Instructions that consume certain core resources are placed between two pointer chasing loads. Once the two cache misses can't overlap, the structure being tested is full. Some tests, especially those measuring scheduler capacity, will hit a mix of instructions to see whether capacity is shared across different categories of instructions.
Alongside structure size tests, AsmGen is a convenient place to put other microbenchmarks that involve generating tons of code. There are several branch predictor tests:
- btb16Unconditional, etc: Creates a chain of taken branches in a loop to measure taken branch latency. Useful for showing BTB size and speed. Different distances between branches are useful because branch predictors sometimes have trouble tracking branches that are too close together.
- btb16Conditional: Same as above but with always-taken conditional branches
- branchhist - Branch history test: Generates branches that are taken or not taken in some random pattern, then increases the length of that pattern and the number of branches. Each branch is given its own pattern. This test thus tries to see how long of a pattern the branch predictor can track before getting a lot of mispredicts.
- indirectbranch - Indirect branch prediction test: Varies the number of branch targets and branches to see how many total targets the indirect branch predictor can track
- returnstack - Tests return prediction with nested calls of varying depths. When the return stack overflows, you'll see an increase in time per call/return pair.
================================================
FILE: AsmGen/UarchTest.cs
================================================
using System.Runtime.Serialization;
using System.Text;
namespace AsmGen
{
// Base class for generated microbenchmark tests. Provides the common C-side glue:
// assembler .global directives, C extern declarations with pthread wrapper functions,
// and the timed test block emitted into main(). Subclasses supply the assembly bodies.
public abstract class UarchTest : IUarchTest
{
// Function name prefix; each generated function is Prefix + count
public string Prefix { get; set; }
// Human readable description printed by the generated harness
public string Description { get; set; }
// Structure sizes (data points) to generate functions for
public int[] Counts;
// C parameter list used in the extern declarations
public string FunctionDefinitionParameters { get; set; }
// Argument list used at the call sites in the generated harness
public string GetFunctionCallParameters { get; set; }
// If true, run 'iterations' total work split across the tested count
// (used e.g. for call stack depth style tests)
public bool DivideTimeByCount { get; set; }
public abstract bool SupportsIsa(IUarchTest.ISA isa);
public abstract void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa);
// Emit a ".global" directive per generated function so the harness can link to them
public void GenerateAsmGlobalLines(StringBuilder sb)
{
for (int i = 0; i < Counts.Length; i++)
sb.AppendLine(".global " + Prefix + Counts[i]);
}
// Emit extern declarations (forced SysV ABI so mingw builds match the asm) plus a
// pthread-compatible wrapper per function for the multithreaded test path
public void GenerateExternLines(StringBuilder sb)
{
for (int i = 0; i < Counts.Length; i++)
{
sb.AppendLine("extern uint64_t " + Prefix + Counts[i] + $"({FunctionDefinitionParameters}) __attribute((sysv_abi));");
// Function that can be launched in a pthread
sb.AppendLine($"void *{IUarchTest.ThreadLaunchFunctionPrefix}{Prefix}{Counts[i]}(void *pa)");
sb.AppendLine("{");
sb.AppendLine(" struct ThreadData *td = (struct ThreadData *)pa;");
sb.AppendLine(" int *A = td->A;");
sb.AppendLine(" int *B = td->B;");
sb.AppendLine(" float *fpArr = td->fpArr;");
sb.AppendLine(" uint32_t list_size = td->list_size;");
sb.AppendLine(" int structIterations = td->structIterations;");
sb.AppendLine(" " + Prefix + Counts[i] + $"({GetFunctionCallParameters});");
sb.AppendLine(" return NULL;");
sb.AppendLine("}");
}
}
// Emit the timed block for this test into the generated main(): for each count,
// time the generated function (single threaded, or via pthreads when -threads > 1)
// and print "count,time-per-iteration" in microseconds... er, scaled by 1e6/1000ms
public void GenerateTestBlock(StringBuilder sb, IUarchTest.ISA isa)
{
sb.AppendLine(" if (argc > 1 && strcmp(test_name, \"" + Prefix + "\") == 0) {");
sb.AppendLine(" printf(\"" + Description + ":\\n\");");
// mips64/riscv lack indexed addressing, so convert the index array into a
// pointer-style array first (see preplatencyarr below)
if (isa == IUarchTest.ISA.mips64 || isa == IUarchTest.ISA.riscv)
{
sb.AppendLine(" if (argc == 1 || argc > 1 && strncmp(test_name, \"btb\", 3) != 0) {");
sb.AppendLine("preplatencyarr(A, list_size);");
sb.AppendLine(" }");
}
for (int i = 0; i < Counts.Length; i++)
{
// use more iterations (iterations = structIterations * 100) and divide iteration count by tested-thing count
// for certain tests like call stack depth
if (DivideTimeByCount)
{
sb.AppendLine(" tmp = structIterations;");
sb.AppendLine(" structIterations = iterations / " + Counts[i] + ";");
}
sb.AppendLine(" gettimeofday(&startTv, &startTz);");
sb.AppendLine("#ifndef __MINGW32__");
sb.AppendLine(" if (threads > 1) {");
sb.AppendLine(" struct ThreadData testThreadData;");
sb.AppendLine(" pthread_t *testThreads = (pthread_t *)malloc(threads * sizeof(pthread_t));");
sb.AppendLine(" testThreadData.A = A;");
sb.AppendLine(" testThreadData.B = B;");
sb.AppendLine(" testThreadData.fpArr = fpArr;");
sb.AppendLine(" testThreadData.list_size = list_size;");
sb.AppendLine(" testThreadData.structIterations = structIterations;");
sb.AppendLine(" for (int threadIdx = 0; threadIdx < threads; threadIdx++) {");
sb.AppendLine($" pthread_create(testThreads + threadIdx, NULL, {IUarchTest.ThreadLaunchFunctionPrefix}{Prefix}{Counts[i]}, &testThreadData);");
sb.AppendLine(" }");
sb.AppendLine(" for (int threadIdx = 0; threadIdx < threads; threadIdx++) {");
sb.AppendLine(" pthread_join(testThreads[threadIdx], NULL);");
sb.AppendLine(" }");
sb.AppendLine(" free(testThreads);");
// launch threads
sb.AppendLine(" } else ");
sb.AppendLine(" " + Prefix + Counts[i] + $"({GetFunctionCallParameters});");
sb.AppendLine("#else");
sb.AppendLine(" " + Prefix + Counts[i] + $"({GetFunctionCallParameters});");
sb.AppendLine("#endif");
sb.AppendLine(" gettimeofday(&endTv, &endTz);");
sb.AppendLine(" time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);");
//sb.AppendLine(" fprintf(stderr, \"%lu ms elapsed, %lu iter\\n\", time_diff_ms, structIterations);");
if (DivideTimeByCount)
sb.AppendLine(" latency = 1e6 * (float)time_diff_ms / (float)(iterations);");
else
sb.AppendLine(" latency = 1e6 * (float)time_diff_ms / (float)(structIterations);");
sb.AppendLine(" printf(\"" + Counts[i] + ",%f\\n\", latency);\n");
if (DivideTimeByCount)
{
sb.AppendLine(" structIterations = tmp;");
}
}
sb.AppendLine(" }\n");
}
/// <summary>
/// MIPS doesn't have an indexed load instruction which means we'd have to use an
/// add+shift (extra two instructions), which would complicate measurements.
/// So convert the 32-bit index array into 64-bit direct addresses in place.
/// NOTE(review): register names and mnemonics ($r12, alsl.d, ld.d) look like
/// LoongArch/Loongson assembler syntax rather than classic MIPS — confirm toolchain.
/// </summary>
/// <param name="sb">StringBuilder to append the generated assembly to</param>
public static void GenerateMipsPrepArrayFunction(StringBuilder sb)
{
// r4 = ptr to arr, r5 = arr len, in 32-bit elements
sb.AppendLine(".global preplatencyarr");
sb.AppendLine("preplatencyarr:");
sb.AppendLine(" xor $r12, $r12, $r12");
sb.AppendLine(" xor $r13, $r13, $r13");
sb.AppendLine(" xor $r14, $r14, $r14");
sb.AppendLine(" xor $r15, $r15, $r15"); // array index
sb.AppendLine(" addi.d $r14, $r14, 1");
sb.AppendLine("preplatencyarr_loop:");
sb.AppendLine(" alsl.d $r12, $r15, $r0, 0x3"); // shift by 3 = multiply by 8 for 64-bit
sb.AppendLine(" add.d $r12, $r4, $r12"); // add loaded value to base address
sb.AppendLine(" ld.d $r13, $r12, 0");
sb.AppendLine(" alsl.d $r13, $r13, $r0, 0x2"); // address calculation for loaded index. this is in 32-bit values
sb.AppendLine(" add.d $r13, $r4, $r13");
sb.AppendLine(" st.d $r13, $r12, 0"); // save calculated address
sb.AppendLine(" add.d $r15, $r15, $r14");
sb.AppendLine(" alsl.d $r16, $r15, $r0, 0x1"); // muliply 64-bit index by 2 to prevent out of bounds for 32-bit list size count
sb.AppendLine(" bne $r16, $r5, preplatencyarr_loop"); // while idx != len
sb.AppendLine(" jr $r1");
}
/// <summary>
/// RISC-V version of the pointer chasing array prep: rewrites each 64-bit slot of
/// the array from a 32-bit element index into a direct address (x10 = arr, x11 = len).
/// </summary>
/// <param name="sb">StringBuilder to append the generated assembly to</param>
public static void GenerateRiscvPrepArrayFunction(StringBuilder sb)
{
sb.AppendLine(".global preplatencyarr");
sb.AppendLine("preplatencyarr:");
sb.AppendLine(" li x7, 0");
sb.AppendLine(" mv x5, x10");
sb.AppendLine("preplatencyarr_loop:");
sb.AppendLine(" ld x28, (x5)");
sb.AppendLine(" slli x28, x28, 2"); // index specified in 32-bit values
sb.AppendLine(" add x28, x28, x10");
sb.AppendLine(" sd x28, (x5)");
sb.AppendLine(" addi x5, x5, 8"); // next element
sb.AppendLine(" addi x7, x7, 2"); // list size is given in 32-bit elements
sb.AppendLine(" blt x7, x11, preplatencyarr_loop");
sb.AppendLine(" ret");
}
}
}
================================================
FILE: AsmGen/UarchTestHelpers.cs
================================================
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AsmGen
{
public static class UarchTestHelpers
{
/// <summary>
/// Builds the array of data points to test: every value from low to high
/// inclusive, stepping by 'step'.
/// </summary>
public static int[] GenerateCountArray(int low, int high, int step)
{
    // First pass: how many points fall in [low, high] at this stride
    int pointCount = 0;
    for (int value = low; value <= high; value += step)
        pointCount++;
    // Second pass: fill the result array
    int[] points = new int[pointCount];
    int writeIdx = 0;
    for (int value = low; value <= high; value += step)
        points[writeIdx++] = value;
    return points;
}
/// <summary>
/// Emits NASM-style "global" directives, one per generated test function,
/// so the assembled symbols are visible to the linker.
/// </summary>
public static void GenerateNasmGlobalLines(StringBuilder sb, UarchTest test)
{
    foreach (int count in test.Counts)
        sb.AppendLine("global " + test.Prefix + count);
}
/// <summary>
/// Emits GNU assembler ".global" directives, one per generated test function,
/// so the assembled symbols are visible to the linker.
/// </summary>
public static void GenerateAsmGlobalLines(StringBuilder sb, UarchTest test)
{
    foreach (int count in test.Counts)
        sb.AppendLine($".global {test.Prefix}{count}");
}
/// <summary>
/// Emits C extern declarations for the generated assembly test functions, forcing
/// the SysV calling convention so mingw/Windows builds match the assembly's
/// argument passing.
/// </summary>
/// <param name="sb">StringBuilder to append the declarations to</param>
/// <param name="test">Test supplying the function name prefix, counts, and C parameter list</param>
public static void GenerateExternLines(StringBuilder sb, UarchTest test)
{
    int[] counts = test.Counts;
    for (int i = 0; i < counts.Length; i++)
        // removed stray empty statement (";;") that previously followed this call
        sb.AppendLine("extern uint64_t " + test.Prefix + counts[i] + $"({test.FunctionDefinitionParameters}) __attribute((sysv_abi));");
}
/// <summary>
/// Emits C++ extern declarations for the Visual Studio build; extern "C" disables
/// name mangling so the C++ harness links against the plain assembly symbols.
/// </summary>
public static void GenerateVsExternLines(StringBuilder sb, UarchTest test)
{
    foreach (int count in test.Counts)
        sb.AppendLine($"extern \"C\" uint64_t {test.Prefix}{count}({test.FunctionDefinitionParameters});");
}
/// <summary>
/// Generates structure size test functions in x86-64 assembly (AT&amp;T syntax), with
/// filler instructions placed between two serial chains of six idivs each. Once the
/// filler no longer fits in the structure under test, the two slow divide chains can
/// no longer overlap and time per iteration rises.
/// Args are moved into rcx, rdx, r8 (in that order) to match the Windows calling convention.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Sizes to test the structure at</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="fillerInstrs1">Filler instructions after first divide chain</param>
/// <param name="fillerInstrs2">Filler instructions after second divide chain</param>
/// <param name="includePtrChasingLoads">If true, count the bracketing ops as consuming
/// the tested resource (two fewer filler instructions are emitted)</param>
/// <param name="initInstrs">Any extra initialization instructions</param>
public static void GenerateX86AsmDivStructureTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool includePtrChasingLoads = true, string initInstrs = null)
{
for (int i = 0; i < counts.Length; i++)
{
string funcName = funcNamePrefix + counts[i];
sb.AppendLine("\n" + funcName + ":");
// save every register the generated body may clobber
sb.AppendLine(" push %rsi");
sb.AppendLine(" push %rdi");
sb.AppendLine(" push %r15");
sb.AppendLine(" push %r14");
sb.AppendLine(" push %r13");
sb.AppendLine(" push %r12");
sb.AppendLine(" push %r11");
sb.AppendLine(" push %r8");
sb.AppendLine(" push %r9");
sb.AppendLine(" push %rcx");
sb.AppendLine(" push %rdx");
// arguments are in RDI, RSI, RDX, RCX, R8, and R9
// move them into familiar windows argument regs (rcx, rdx, r8)
sb.AppendLine(" mov %rcx, %r9"); // r9 <- rcx
sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
// r11-r15 hold small constants for filler instructions to consume
sb.AppendLine(" xor %r15, %r15");
sb.AppendLine(" mov $0x10, %r14");
sb.AppendLine(" mov $0x20, %r13");
sb.AppendLine(" mov $0x30, %r12");
sb.AppendLine(" mov $0x40, %r11");
if (initInstrs != null) sb.AppendLine(initInstrs);
sb.AppendLine(" mov %rdx, %rdi");
sb.AppendLine(" mov %rdx, %rsi");
sb.AppendLine("\n" + funcName + "start:");
// keep dividing list size by itself
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" mov %rdi, %rax");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" sub %rax, %rsi");
sb.AppendLine(" inc %rsi");
// rdx is the remainder, rax is the quotient
int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
// cycle round-robin through the supplied filler instructions
for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
{
sb.AppendLine(fillerInstrs1[instrIdx]);
instrIdx = (instrIdx + 1) % fillerInstrs1.Length;
}
// second divide chain, mirroring the first with rdi/rsi swapped
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" mov %rsi, %rax");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" sub %rax, %rdi");
sb.AppendLine(" inc %rdi");
for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
{
sb.AppendLine(fillerInstrs2[instrIdx]);
instrIdx = (instrIdx + 1) % fillerInstrs2.Length;
}
// rcx = iteration count (first argument)
sb.AppendLine(" dec %rcx");
sb.AppendLine(" jne " + funcName + "start");
sb.AppendLine(" pop %rdx");
sb.AppendLine(" pop %rcx");
sb.AppendLine(" pop %r9");
sb.AppendLine(" pop %r8");
sb.AppendLine(" pop %r11");
sb.AppendLine(" pop %r12");
sb.AppendLine(" pop %r13");
sb.AppendLine(" pop %r14");
sb.AppendLine(" pop %r15");
sb.AppendLine(" pop %rdi");
sb.AppendLine(" pop %rsi");
sb.AppendLine(" ret\n\n");
}
}
/// <summary>
/// Generates non-scheduling-queue style test functions bracketed by idiv chains.
/// A fixed total of maxSize filler instructions is emitted after each divide chain;
/// the first counts[i] of them depend on the divide result, the rest are independent.
/// When the dependent run exceeds the queue capacity under test, the independent ops
/// can no longer issue and a dispatch stall shows up in the timing.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="maxSize">Total filler ops emitted per section (dependent + independent)</param>
/// <param name="counts">Data points: number of dependent ops at each step</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="depInstrs">Instructions dependent on the divide chain result</param>
/// <param name="indepInstrs">Instructions independent of the divide chain</param>
/// <param name="divsInSq">If true, count the six divide ops against the tested capacity</param>
/// <param name="initInstrs">Any extra initialization instructions</param>
public static void GenerateX86AsmDivNsqTestFuncs(StringBuilder sb,
int maxSize,
int[] counts,
string funcNamePrefix,
string[] depInstrs,
string[] indepInstrs,
bool divsInSq = false,
string initInstrs = null)
{
for (int i = 0; i < counts.Length; i++)
{
string funcName = funcNamePrefix + counts[i];
sb.AppendLine("\n" + funcName + ":");
// save registers clobbered by the generated body
sb.AppendLine(" push %rsi");
sb.AppendLine(" push %rdi");
sb.AppendLine(" push %r15");
sb.AppendLine(" push %r14");
sb.AppendLine(" push %r13");
sb.AppendLine(" push %r12");
sb.AppendLine(" push %r11");
sb.AppendLine(" push %r8");
sb.AppendLine(" push %rcx");
sb.AppendLine(" push %rdx");
// arguments are in RDI, RSI, RDX, RCX, R8, and R9
// move them into familiar windows argument regs (rcx, rdx, r8)
sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
// r11-r15 hold small constants for filler instructions to consume
sb.AppendLine(" xor %r15, %r15");
sb.AppendLine(" mov $0x10, %r14");
sb.AppendLine(" mov $0x20, %r13");
sb.AppendLine(" mov $0x30, %r12");
sb.AppendLine(" mov $0x40, %r11");
if (initInstrs != null) sb.AppendLine(initInstrs);
sb.AppendLine(" mov %rdx, %rdi");
sb.AppendLine(" mov %rdx, %rsi");
sb.AppendLine("\n" + funcName + "start:");
// keep dividing list size by itself
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" mov %rdi, %rax"); // divide rdi by rsi
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" sub %rax, %rsi");
sb.AppendLine(" inc %rsi");
// rdx is the remainder, rax is the quotient
// subtract 6 when the divide chain itself occupies the queue under test
int fillerInstrCount = divsInSq ? counts[i] - 6 : counts[i];
// dependent ops first, then pad with independent ops up to maxSize total
for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < maxSize; fillerIdx++)
{
if (fillerIdx < fillerInstrCount)
{
sb.AppendLine(depInstrs[depInstrIdx]);
depInstrIdx = (depInstrIdx + 1) % depInstrs.Length;
}
else
{
sb.AppendLine(indepInstrs[indepInstrIdx]);
indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;
}
}
// mirrored second section with rdi/rsi swapped
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" mov %rsi, %rax"); // divide rsi by rdi
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" sub %rax, %rdi");
sb.AppendLine(" inc %rdi");
for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < maxSize; fillerIdx++)
{
if (fillerIdx < fillerInstrCount)
{
sb.AppendLine(depInstrs[depInstrIdx]);
depInstrIdx = (depInstrIdx + 1) % depInstrs.Length;
}
else
{
sb.AppendLine(indepInstrs[indepInstrIdx]);
indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;
}
}
// rcx = iteration count (first argument)
sb.AppendLine(" dec %rcx");
sb.AppendLine(" jne " + funcName + "start");
sb.AppendLine(" pop %rdx");
sb.AppendLine(" pop %rcx");
sb.AppendLine(" pop %r8");
sb.AppendLine(" pop %r11");
sb.AppendLine(" pop %r12");
sb.AppendLine(" pop %r13");
sb.AppendLine(" pop %r14");
sb.AppendLine(" pop %r15");
sb.AppendLine(" pop %rdi");
sb.AppendLine(" pop %rsi");
sb.AppendLine(" ret\n\n");
}
}
/// <summary>
/// Generates structure size test functions using two pointer chasing loads
/// (cache misses) as the serializing operations, with filler instructions between
/// them. Once the filler exceeds the structure's capacity the two misses can no
/// longer overlap. With lfence=true the second load is followed by an lfence
/// instead of a second filler block.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Sizes to test the structure at</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="fillerInstrs1">Filler instructions after first ptr chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after second ptr chasing load (unused when lfence=true)</param>
/// <param name="includePtrChasingLoads">If true, count ptr chasing loads as consuming the tested resource</param>
/// <param name="initInstrs">Any extra initialization instructions</param>
/// <param name="postLoadInstrs1">Instructions emitted right after the first load</param>
/// <param name="postLoadInstrs2">Instructions emitted right after the second load (non-lfence path)</param>
/// <param name="lfence">Serialize with lfence after the second load instead of emitting fillerInstrs2</param>
/// <param name="cleanupInstrs">Instructions emitted after the loop, before register restore</param>
public static void GenerateX86AsmStructureTestFuncs(StringBuilder sb,
int[] counts,
string funcNamePrefix,
string[] fillerInstrs1,
string[] fillerInstrs2,
bool includePtrChasingLoads = true,
string initInstrs = null,
string postLoadInstrs1 = null,
string postLoadInstrs2 = null,
bool lfence = true,
string cleanupInstrs = null)
{
for (int i = 0; i < counts.Length; i++)
{
string funcName = funcNamePrefix + counts[i];
sb.AppendLine("\n" + funcName + ":");
// save registers clobbered by the generated body
sb.AppendLine(" push %rsi");
sb.AppendLine(" push %rdi");
sb.AppendLine(" push %r15");
sb.AppendLine(" push %r14");
sb.AppendLine(" push %r13");
sb.AppendLine(" push %r12");
sb.AppendLine(" push %r11");
sb.AppendLine(" push %r8");
sb.AppendLine(" push %rcx");
sb.AppendLine(" push %rdx");
// arguments are in RDI, RSI, RDX, RCX, R8, and R9
// move them into familiar windows argument regs (rcx, rdx, r8)
sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
// r11-r15 hold small constants for filler instructions to consume
sb.AppendLine(" xor %r15, %r15");
sb.AppendLine(" mov $0x1, %r14");
sb.AppendLine(" mov $0x2, %r13");
sb.AppendLine(" mov $0x3, %r12");
sb.AppendLine(" mov $0x4, %r11");
if (initInstrs != null) sb.AppendLine(initInstrs);
// start two independent pointer chasing chains at indices 0 and 0x40;
// rdx holds the array base, edi/esi hold the current indices
sb.AppendLine(" xor %rdi, %rdi");
sb.AppendLine(" mov $0x40, %esi");
sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
sb.AppendLine("\n" + funcName + "start:");
sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);
int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
// cycle round-robin through the supplied filler instructions
for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
{
sb.AppendLine(fillerInstrs1[instrIdx]);
instrIdx = (instrIdx + 1) % fillerInstrs1.Length;
}
sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
if (lfence) sb.AppendLine("lfence");
else
{
if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);
for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
{
sb.AppendLine(fillerInstrs2[instrIdx]);
instrIdx = (instrIdx + 1) % fillerInstrs2.Length;
}
}
// rcx = iteration count (first argument)
sb.AppendLine(" dec %rcx");
sb.AppendLine(" jne " + funcName + "start");
if (cleanupInstrs != null) sb.AppendLine(cleanupInstrs);
sb.AppendLine(" pop %rdx");
sb.AppendLine(" pop %rcx");
sb.AppendLine(" pop %r8");
sb.AppendLine(" pop %r11");
sb.AppendLine(" pop %r12");
sb.AppendLine(" pop %r13");
sb.AppendLine(" pop %r14");
sb.AppendLine(" pop %r15");
sb.AppendLine(" pop %rdi");
sb.AppendLine(" pop %rsi");
sb.AppendLine(" ret\n\n");
}
}
/// <summary>
/// Generate test functions to see how big a scheduler is, without a NSQ.
/// Dependent ops are followed by independent ops, total op count = totalOps.
/// If the number of dependent ops is greater than SQ size, indep ops can't be
/// executed and there will be a dispatch stall.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="totalOps">Total ops between dependent loads. Must be less than RF size
/// but greater than SQ+NSQ size</param>
/// <param name="counts">Array of data points to test (SQ sizes in this case)</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="dependentInstrs">Instructions dependent on the ptr chasing load</param>
/// <param name="indepInstrs">Instructions independent of the ptr chasing load</param>
/// <param name="ptrChasingLoadsInSq">Do ptr chasing loads occupy entries in the SQ being measured?</param>
/// <param name="initInstrs">Any extra initialization instructions</param>
/// <param name="postLoadInstrs">Instructions emitted right after the first load</param>
public static void GenerateX86AsmNsqTestFuncs(StringBuilder sb,
int totalOps,
int[] counts,
string funcNamePrefix,
string[] dependentInstrs,
string[] indepInstrs,
bool ptrChasingLoadsInSq = false,
string initInstrs = null,
string postLoadInstrs = null)
{
for (int i = 0; i < counts.Length; i++)
{
string funcName = funcNamePrefix + counts[i];
sb.AppendLine("\n" + funcName + ":");
// save registers clobbered by the generated body
sb.AppendLine(" push %rsi");
sb.AppendLine(" push %rdi");
sb.AppendLine(" push %r15");
sb.AppendLine(" push %r14");
sb.AppendLine(" push %r13");
sb.AppendLine(" push %r12");
sb.AppendLine(" push %r11");
sb.AppendLine(" push %r8");
sb.AppendLine(" push %rcx");
sb.AppendLine(" push %rdx");
// arguments are in RDI, RSI, RDX, RCX, R8, and R9
// move them into familiar windows argument regs (rcx, rdx, r8)
sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
// r11-r15 hold small constants for filler instructions to consume
sb.AppendLine(" xor %r15, %r15");
sb.AppendLine(" mov $0x1, %r14");
sb.AppendLine(" mov $0x2, %r13");
sb.AppendLine(" mov $0x3, %r12");
sb.AppendLine(" mov $0x4, %r11");
if (initInstrs != null) sb.AppendLine(initInstrs);
// start two pointer chasing chains at indices 0 and 0x40
sb.AppendLine(" xor %rdi, %rdi");
sb.AppendLine(" mov $0x40, %esi");
sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
sb.AppendLine("\n" + funcName + "start:");
sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
if (postLoadInstrs != null) sb.AppendLine(postLoadInstrs);
int sqInstrs = ptrChasingLoadsInSq ? counts[i] - 2 : counts[i];
// dependent ops first, then pad with independent ops up to totalOps
for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < totalOps; fillerIdx++)
{
if (fillerIdx < sqInstrs)
{
sb.AppendLine(dependentInstrs[depInstrIdx]);
depInstrIdx = (depInstrIdx + 1) % dependentInstrs.Length;
}
else
{
sb.AppendLine(indepInstrs[indepInstrIdx]);
indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;
}
}
sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
sb.AppendLine(" lfence");
// rcx = iteration count (first argument)
sb.AppendLine(" dec %rcx");
sb.AppendLine(" jne " + funcName + "start");
sb.AppendLine(" pop %rdx");
sb.AppendLine(" pop %rcx");
sb.AppendLine(" pop %r8");
sb.AppendLine(" pop %r11");
sb.AppendLine(" pop %r12");
sb.AppendLine(" pop %r13");
sb.AppendLine(" pop %r14");
sb.AppendLine(" pop %r15");
sb.AppendLine(" pop %rdi");
sb.AppendLine(" pop %rsi");
sb.AppendLine(" ret\n\n");
}
}
/// <summary>
/// Generate test functions for testing integer scheduler capacity.
/// R15's value is made dependent on the pointer chasing load results
/// (mov %rdi/%rsi into %r15 after each load), so filler instructions reading
/// r15 are dependent on the cache miss.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Sizes to test the scheduler at</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="fillerInstrs1">Filler instructions after first ptr chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after second ptr chasing load</param>
/// <param name="divs">If true, count the bracketing ops against the tested capacity
/// (two fewer filler instructions are emitted)</param>
/// <param name="initInstrs">Any extra initialization instructions</param>
public static void GenerateX86AsmIntSchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool divs = true, string initInstrs = null)
{
for (int i = 0; i < counts.Length; i++)
{
string funcName = funcNamePrefix + counts[i];
sb.AppendLine("\n" + funcName + ":");
// save registers clobbered by the generated body
sb.AppendLine(" push %rsi");
sb.AppendLine(" push %rdi");
sb.AppendLine(" push %r15");
sb.AppendLine(" push %r14");
sb.AppendLine(" push %r13");
sb.AppendLine(" push %r12");
sb.AppendLine(" push %r11");
sb.AppendLine(" push %r8");
sb.AppendLine(" push %rcx");
sb.AppendLine(" push %rdx");
// arguments are in RDI, RSI, RDX, RCX, R8, and R9
// move them into familiar windows argument regs (rcx, rdx, r8)
sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
// r11-r15 hold small constants for filler instructions to consume
sb.AppendLine(" xor %r15, %r15");
sb.AppendLine(" mov $0x1, %r14");
sb.AppendLine(" mov $0x2, %r13");
sb.AppendLine(" mov $0x3, %r12");
sb.AppendLine(" mov $0x4, %r11");
if (initInstrs != null) sb.AppendLine(initInstrs);
// start two pointer chasing chains at indices 0 and 0x40
sb.AppendLine(" xor %rdi, %rdi");
sb.AppendLine(" mov $0x40, %esi");
sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
sb.AppendLine("\n" + funcName + "start:");
sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
// make r15 depend on the load result so filler reading r15 waits on the miss
sb.AppendLine(" mov %rdi, %r15");
int fillerInstrCount = divs ? counts[i] - 2 : counts[i];
// cycle round-robin through the supplied filler instructions
for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
{
sb.AppendLine(fillerInstrs1[instrIdx]);
instrIdx = (instrIdx + 1) % fillerInstrs1.Length;
}
sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
sb.AppendLine(" mov %rsi, %r15");
for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
{
sb.AppendLine(fillerInstrs2[instrIdx]);
instrIdx = (instrIdx + 1) % fillerInstrs2.Length;
}
// rcx = iteration count (first argument)
sb.AppendLine(" dec %rcx");
sb.AppendLine(" jne " + funcName + "start");
sb.AppendLine(" pop %rdx");
sb.AppendLine(" pop %rcx");
sb.AppendLine(" pop %r8");
sb.AppendLine(" pop %r11");
sb.AppendLine(" pop %r12");
sb.AppendLine(" pop %r13");
sb.AppendLine(" pop %r14");
sb.AppendLine(" pop %r15");
sb.AppendLine(" pop %rdi");
sb.AppendLine(" pop %rsi");
sb.AppendLine(" ret\n\n");
}
}
/// <summary>
/// Generates pointer chasing test functions in assembly (AT&amp;T syntax), with
/// xmm0 &lt;- cvtsi2ss of each pointer chasing load result, so FP filler instructions
/// reading xmm0 depend on the loads. xmm1-xmm5 are initialized from the third
/// argument and can be used as independent filler operands.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test; one function is generated per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load</param>
public static void GenerateX86AsmFpSchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        sb.AppendLine("\n" + funcName + ":");
        // save registers clobbered below; popped in reverse order before ret
        sb.AppendLine(" push %rsi");
        sb.AppendLine(" push %rdi");
        sb.AppendLine(" push %r15");
        sb.AppendLine(" push %r14");
        sb.AppendLine(" push %r13");
        sb.AppendLine(" push %r12");
        sb.AppendLine(" push %r8");
        sb.AppendLine(" push %rcx");
        sb.AppendLine(" push %rdx");
        // arguments are in RDI, RSI, RDX, RCX, R8, and R9
        // move them into familiar windows argument regs (rcx, rdx, r8)
        sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
        sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
        sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
        // known integer values for filler instructions to consume
        sb.AppendLine(" xor %r15, %r15");
        sb.AppendLine(" mov $0x1, %r14");
        sb.AppendLine(" mov $0x1, %r13");
        sb.AppendLine(" mov $0x3, %r12");
        // initialize some FP values off r8 (third argument)
        sb.AppendLine(" movss (%r8), %xmm1");
        sb.AppendLine(" movss 4(%r8), %xmm2");
        sb.AppendLine(" movss 8(%r8), %xmm3");
        sb.AppendLine(" movss 12(%r8), %xmm4");
        sb.AppendLine(" movss 16(%r8), %xmm5");
        // start one chain at 0, and the other at 0x40
        sb.AppendLine(" xor %rdi, %rdi");
        sb.AppendLine(" mov $0x40, %esi");
        sb.AppendLine("\n" + funcName + "start:");
        sb.AppendLine(" mov (%rdx,%rdi,4), %edi"); // chain 1: current = A[current]
        sb.AppendLine(" cvtsi2ss %rdi, %xmm0"); // make xmm0 depend on the load result
        int fillerInstrCount = counts[i];
        for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
        {
            sb.AppendLine(fillerInstrs1[instrIdx]);
            instrIdx = (instrIdx + 1) % fillerInstrs1.Length;
        }
        sb.AppendLine(" mov (%rdx,%rsi,4), %esi"); // chain 2: current = A[current]
        sb.AppendLine(" cvtsi2ss %rsi, %xmm0");
        for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
        {
            sb.AppendLine(fillerInstrs2[instrIdx]);
            instrIdx = (instrIdx + 1) % fillerInstrs2.Length;
        }
        sb.AppendLine(" dec %rcx");
        sb.AppendLine(" jne " + funcName + "start");
        sb.AppendLine(" pop %rdx");
        sb.AppendLine(" pop %rcx");
        sb.AppendLine(" pop %r8");
        sb.AppendLine(" pop %r12");
        sb.AppendLine(" pop %r13");
        sb.AppendLine(" pop %r14");
        sb.AppendLine(" pop %r15");
        sb.AppendLine(" pop %rdi");
        sb.AppendLine(" pop %rsi");
        sb.AppendLine(" ret\n\n");
    }
}
/// <summary>
/// 256-bit variant of <see cref="GenerateX86AsmFpSchedTestFuncs"/>: ymm0 is filled
/// via vbroadcastss from an address that depends on each pointer chasing load result,
/// so 256-bit filler instructions reading ymm0 depend on the loads. ymm1-ymm5 are
/// initialized from the third argument as independent operands.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test; one function is generated per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load</param>
public static void GenerateX86AsmFp256SchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        sb.AppendLine("\n" + funcName + ":");
        // save registers clobbered below; popped in reverse order before ret
        sb.AppendLine(" push %rsi");
        sb.AppendLine(" push %rdi");
        sb.AppendLine(" push %r15");
        sb.AppendLine(" push %r14");
        sb.AppendLine(" push %r13");
        sb.AppendLine(" push %r12");
        sb.AppendLine(" push %r8");
        sb.AppendLine(" push %rcx");
        sb.AppendLine(" push %rdx");
        // arguments are in RDI, RSI, RDX, RCX, R8, and R9
        // move them into familiar windows argument regs (rcx, rdx, r8)
        sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
        sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
        sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
        // known integer values for filler instructions to consume
        sb.AppendLine(" xor %r15, %r15");
        sb.AppendLine(" mov $0x1, %r14");
        sb.AppendLine(" mov $0x1, %r13");
        sb.AppendLine(" mov $0x3, %r12");
        // initialize some FP values off r8 (third argument)
        sb.AppendLine(" vzeroupper");
        sb.AppendLine(" vmovups (%r8), %ymm1");
        sb.AppendLine(" vmovups 32(%r8), %ymm2");
        sb.AppendLine(" vmovups 64(%r8), %ymm3");
        sb.AppendLine(" vmovups 96(%r8), %ymm4");
        sb.AppendLine(" vmovups 128(%r8), %ymm5");
        // start one chain at 0, and the other at 0x40
        sb.AppendLine(" xor %rdi, %rdi");
        sb.AppendLine(" mov $0x40, %esi");
        sb.AppendLine("\n" + funcName + "start:");
        sb.AppendLine(" mov (%rdx,%rdi,4), %edi"); // chain 1: current = A[current]
        sb.AppendLine(" vbroadcastss (%r8,%rdi,4), %ymm0"); // ymm0 depends on the load result
        int fillerInstrCount = counts[i];
        for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
        {
            sb.AppendLine(fillerInstrs1[instrIdx]);
            instrIdx = (instrIdx + 1) % fillerInstrs1.Length;
        }
        sb.AppendLine(" mov (%rdx,%rsi,4), %esi"); // chain 2: current = A[current]
        sb.AppendLine(" vbroadcastss (%r8,%rsi,4), %ymm0");
        for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
        {
            sb.AppendLine(fillerInstrs2[instrIdx]);
            instrIdx = (instrIdx + 1) % fillerInstrs2.Length;
        }
        sb.AppendLine(" dec %rcx");
        sb.AppendLine(" jne " + funcName + "start");
        sb.AppendLine(" pop %rdx");
        sb.AppendLine(" pop %rcx");
        sb.AppendLine(" pop %r8");
        sb.AppendLine(" pop %r12");
        sb.AppendLine(" pop %r13");
        sb.AppendLine(" pop %r14");
        sb.AppendLine(" pop %r15");
        sb.AppendLine(" pop %rdi");
        sb.AppendLine(" pop %rsi");
        sb.AppendLine(" ret\n\n");
    }
}
/// <summary>
/// Generates test functions in assembly (NASM/Intel syntax), with filler instructions
/// between two chains of dependent integer divides (the slow ops that stall retirement).
/// Args are expected in rcx, rdx, r8 (in that order), matching the Windows calling convention.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Sizes to test the structure at</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="fillerInstrs1">Filler instructions after the first divide chain</param>
/// <param name="fillerInstrs2">Filler instructions after the second divide chain</param>
/// <param name="includePtrChasingLoads">If true (default), count two instructions per block
/// against the tested resource, i.e. emit two fewer filler instructions. The name follows
/// the other generators; here the long-latency op is a divide chain, not a load.</param>
/// <param name="initInstrs">Any extra initialization instructions</param>
public static void GenerateX86NasmDivStructureTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool includePtrChasingLoads = true, string initInstrs = null)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        sb.AppendLine("\n" + funcName + ":");
        // save registers clobbered below; popped in reverse order before ret
        sb.AppendLine(" push rsi");
        sb.AppendLine(" push rdi");
        sb.AppendLine(" push r15");
        sb.AppendLine(" push r14");
        sb.AppendLine(" push r13");
        sb.AppendLine(" push r12");
        sb.AppendLine(" push r11");
        // known values for filler instructions to consume
        sb.AppendLine(" xor r15, r15");
        sb.AppendLine(" mov r14, 0x10");
        sb.AppendLine(" mov r13, 0x20");
        sb.AppendLine(" mov r12, 0x30");
        sb.AppendLine(" mov r11, 0x40");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        // rdx (second argument) seeds both divide chains
        sb.AppendLine(" mov rdi, rdx");
        sb.AppendLine(" mov rsi, rdx");
        sb.AppendLine("\n" + funcName + "start:");
        // first chain: six dependent idivs; rdx must be zeroed before each
        // because idiv divides rdx:rax
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" mov rax, rdi");
        sb.AppendLine(" idiv rsi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rsi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rsi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rsi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rsi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rsi");
        // keep rsi's value dependent on the chain's result, and nonzero
        sb.AppendLine(" sub rsi, rax");
        sb.AppendLine(" inc rsi");
        int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
        for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
        {
            sb.AppendLine(fillerInstrs1[instrIdx]);
            instrIdx = (instrIdx + 1) % fillerInstrs1.Length;
        }
        // second chain, mirrored: divides by rdi, result folded back into rdi
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" mov rax, rsi");
        sb.AppendLine(" idiv rdi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rdi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rdi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rdi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rdi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rdi");
        sb.AppendLine(" sub rdi, rax");
        sb.AppendLine(" inc rdi");
        for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
        {
            sb.AppendLine(fillerInstrs2[instrIdx]);
            instrIdx = (instrIdx + 1) % fillerInstrs2.Length;
        }
        sb.AppendLine(" dec rcx");
        sb.AppendLine(" jne " + funcName + "start");
        sb.AppendLine(" pop r11");
        sb.AppendLine(" pop r12");
        sb.AppendLine(" pop r13");
        sb.AppendLine(" pop r14");
        sb.AppendLine(" pop r15");
        sb.AppendLine(" pop rdi");
        sb.AppendLine(" pop rsi");
        sb.AppendLine(" ret\n\n");
    }
}
/// <summary>
/// Generates test functions in ARM (aarch64) assembly.
/// Registers x15-x10 hold known values and can be used by filler instructions.
/// Args are in x0 (iterations), x1 (pointer chasing array), x2 (sink).
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test; one function is generated per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load
/// (not emitted when dsb is true)</param>
/// <param name="includePtrChasingLoads">If true, count the two pointer chasing loads
/// against the tested structure, i.e. emit two fewer filler instructions</param>
/// <param name="initInstrs">Extra initialization instructions, or null</param>
/// <param name="postLoadInstrs1">Instructions emitted right after the first load, or null</param>
/// <param name="postLoadInstrs2">Instructions emitted right after the second load
/// (only in non-dsb mode), or null</param>
/// <param name="dsb">use dsb/isb as lfence after the second load instead of the second filler block</param>
public static void GenerateArmAsmStructureTestFuncs(StringBuilder sb,
    int[] counts,
    string funcNamePrefix,
    string[] fillerInstrs1,
    string[] fillerInstrs2,
    bool includePtrChasingLoads = false,
    string initInstrs = null,
    string postLoadInstrs1 = null,
    string postLoadInstrs2 = null,
    bool dsb = true)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        // args in x0, x1
        sb.AppendLine("\n" + funcName + ":");
        // save clobbered registers on the stack
        sb.AppendLine(" sub sp, sp, #0x50");
        sb.AppendLine(" stp x14, x15, [sp, #0x10]");
        sb.AppendLine(" stp x12, x13, [sp, #0x20]");
        sb.AppendLine(" stp x10, x11, [sp, #0x30]");
        sb.AppendLine(" stp x25, x26, [sp, #0x40]");
        // known values for filler instructions to consume
        sb.AppendLine(" mov x15, 1");
        sb.AppendLine(" mov x14, 2");
        sb.AppendLine(" mov x13, 3");
        sb.AppendLine(" mov x12, 4");
        sb.AppendLine(" mov x11, 5");
        sb.AppendLine(" mov x10, 6");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        // two pointer chasing chains: w25 starts at index 0, w26 at 0x40
        sb.AppendLine(" mov w25, 0x0");
        sb.AppendLine(" mov w26, 0x40");
        sb.AppendLine("\n" + funcName + "start:");
        sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
        if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);
        int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
        for (int nopIdx = 0, addIdx = 0; nopIdx < fillerInstrCount; nopIdx++)
        {
            sb.AppendLine(fillerInstrs1[addIdx]);
            addIdx = (addIdx + 1) % fillerInstrs1.Length;
        }
        sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
        if (dsb)
        {
            // serialize: closest aarch64 equivalent to lfence
            sb.AppendLine(" dsb sy");
            sb.AppendLine(" isb sy");
        }
        else
        {
            if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);
            for (int nopIdx = 0, addIdx = 0; nopIdx < fillerInstrCount; nopIdx++)
            {
                sb.AppendLine(fillerInstrs2[addIdx]);
                addIdx = (addIdx + 1) % fillerInstrs2.Length;
            }
        }
        sb.AppendLine(" sub x0, x0, 1");
        sb.AppendLine(" cbnz x0, " + funcName + "start");
        sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
        sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
        sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
        sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
        sb.AppendLine(" add sp, sp, #0x50");
        sb.AppendLine(" ret\n\n");
    }
}
/// <summary>
/// Generates ARM (aarch64) non-scheduling-queue test functions. After the first
/// pointer chasing load, exactly totalOps instructions are emitted: the first
/// counts[i] depend on the load result (they must occupy scheduler/NSQ slots),
/// and the remainder are independent.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="totalOps">Total instructions emitted per pointer chasing load</param>
/// <param name="counts">Number of load-dependent instructions to test; one function per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="dependentInstrs">Instructions dependent on the pointer chasing load result</param>
/// <param name="indepInstrs">Instructions independent of the load result</param>
/// <param name="ptrChasingLoadsInSq">If true, count the two pointer chasing loads
/// against the tested queue (two fewer dependent instructions emitted)</param>
/// <param name="initInstrs">Extra initialization instructions, or null</param>
/// <param name="postLoadInstrs">Instructions emitted right after the first load, or null</param>
public static void GenerateArmAsmNsqTestFuncs(StringBuilder sb,
    int totalOps,
    int[] counts,
    string funcNamePrefix,
    string[] dependentInstrs,
    string[] indepInstrs,
    bool ptrChasingLoadsInSq = false,
    string initInstrs = null,
    string postLoadInstrs = null)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        // args in x0, x1
        sb.AppendLine("\n" + funcName + ":");
        sb.AppendLine(" sub sp, sp, #0x50");
        sb.AppendLine(" stp x14, x15, [sp, #0x10]");
        sb.AppendLine(" stp x12, x13, [sp, #0x20]");
        sb.AppendLine(" stp x10, x11, [sp, #0x30]");
        sb.AppendLine(" stp x25, x26, [sp, #0x40]");
        // known values for filler instructions to consume
        sb.AppendLine(" mov x15, 1");
        sb.AppendLine(" mov x14, 2");
        sb.AppendLine(" mov x13, 3");
        sb.AppendLine(" mov x12, 4");
        sb.AppendLine(" mov x11, 5");
        sb.AppendLine(" mov x10, 6");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        sb.AppendLine(" mov w25, 0x0");
        sb.AppendLine(" mov w26, 0x40");
        sb.AppendLine("\n" + funcName + "start:");
        sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
        if (postLoadInstrs != null) sb.AppendLine(postLoadInstrs);
        int sqInstrs = ptrChasingLoadsInSq ? counts[i] - 2 : counts[i];
        // BUGFIX: use separate round-robin indices for the two arrays (as
        // GenerateArmAsmDivNsqTestFuncs does). Previously indepInstrs was indexed
        // by a counter wrapped with dependentInstrs.Length, which reads out of
        // bounds whenever indepInstrs is shorter than dependentInstrs.
        for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < totalOps; fillerIdx++)
        {
            if (fillerIdx < sqInstrs)
            {
                sb.AppendLine(dependentInstrs[depInstrIdx]);
                depInstrIdx = (depInstrIdx + 1) % dependentInstrs.Length;
            }
            else
            {
                sb.AppendLine(indepInstrs[indepInstrIdx]);
                indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;
            }
        }
        sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
        sb.AppendLine(" dsb sy"); // close enough to lfence
        sb.AppendLine(" isb sy");
        sb.AppendLine(" sub x0, x0, 1");
        sb.AppendLine(" cbnz x0, " + funcName + "start");
        sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
        sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
        sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
        sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
        sb.AppendLine(" add sp, sp, #0x50");
        sb.AppendLine(" ret\n\n");
    }
}
/// <summary>
/// Emits placeholder functions that immediately return — filler for tests that
/// are not implemented yet on a given ISA.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Test sizes; one stub function is emitted per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
public static void GenerateStub(StringBuilder sb, int[] counts, string funcNamePrefix)
{
    foreach (int count in counts)
    {
        sb.AppendLine("\n" + funcNamePrefix + count + ":");
        sb.AppendLine(" ret");
    }
}
/// <summary>
/// FP scheduler test generator for aarch64: after each pointer chasing load,
/// s16 is reloaded from an address derived from the load result, so FP filler
/// instructions reading s16 depend on the loads.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="fillerInstrs1">FP filler instructions after the first load</param>
/// <param name="fillerInstrs2">FP filler instructions after the second load</param>
public static void GenerateArmAsmFpSchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2)
{
    GenerateArmAsmStructureTestFuncs(sb,
        counts,
        funcNamePrefix,
        fillerInstrs1,
        fillerInstrs2,
        includePtrChasingLoads: false,
        initInstrs: null,
        postLoadInstrs1: " ldr s16, [x2, w25, uxtw #2]",
        postLoadInstrs2: " ldr s16, [x2, w26, uxtw #2]");
}
/// <summary>
/// Generates ARM (aarch64) test functions with filler instructions between two
/// chains of dependent udivs — the divide chains play the role the pointer
/// chasing loads do in <see cref="GenerateArmAsmStructureTestFuncs"/>.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test; one function is generated per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first divide chain</param>
/// <param name="fillerInstrs2">Filler instructions after the second divide chain</param>
/// <param name="includePtrChasingLoads">If true, count two instructions per block
/// against the tested structure (two fewer filler instructions emitted)</param>
/// <param name="initInstrs">Extra initialization instructions, or null</param>
public static void GenerateArmAsmDivStructureTestFuncs(StringBuilder sb,
    int[] counts,
    string funcNamePrefix,
    string[] fillerInstrs1,
    string[] fillerInstrs2,
    bool includePtrChasingLoads = false,
    string initInstrs = null)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        // args in x0 = iterations, x1 = list size, x2 = list (sink)
        sb.AppendLine("\n" + funcName + ":");
        sb.AppendLine(" sub sp, sp, #0x50");
        sb.AppendLine(" stp x14, x15, [sp, #0x10]");
        sb.AppendLine(" stp x12, x13, [sp, #0x20]");
        sb.AppendLine(" stp x10, x11, [sp, #0x30]");
        sb.AppendLine(" stp x25, x26, [sp, #0x40]");
        // known values for filler instructions; w13 (=3) is also the divisor below
        sb.AppendLine(" mov x15, 1");
        sb.AppendLine(" mov x14, 2");
        sb.AppendLine(" mov x13, 3");
        sb.AppendLine(" mov x12, 4");
        sb.AppendLine(" mov x11, 5");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        sb.AppendLine(" mov w25, 0x0");
        sb.AppendLine(" mov w26, 0x40");
        sb.AppendLine("\n" + funcName + "start:");
        // first chain: five dependent udivs feeding w25 from the list size
        sb.AppendLine(" mov w25, w1");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
        for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
        {
            sb.AppendLine(fillerInstrs1[addIdx]);
            addIdx = (addIdx + 1) % fillerInstrs1.Length;
        }
        // second chain, mirrored into w26
        sb.AppendLine(" mov w26, w1");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
        {
            sb.AppendLine(fillerInstrs2[addIdx]);
            addIdx = (addIdx + 1) % fillerInstrs2.Length;
        }
        sb.AppendLine(" sub x0, x0, 1");
        sb.AppendLine(" cbnz x0, " + funcName + "start");
        sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
        sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
        sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
        sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
        sb.AppendLine(" add sp, sp, #0x50");
        sb.AppendLine(" ret\n\n");
    }
}
// Just to deal with A73: returns a compare + conditional branch whose outcome
// depends on both pointer chasing results (x25/x26), keeping the branch
// unresolved until the loads complete.
public static string GetArmDependentBranch(string prefix)
{
    return " cmp x25, x26\n b.eq " + prefix + "_badthing";
}
// Landing pad for GetArmDependentBranch: a label followed by a raw data word
// that is not expected to execute — presumably an undefined encoding so a
// mispredicted fall-through would fault (TODO confirm encoding intent).
public static string GetArmDependentBranchTarget(string prefix)
{
    return string.Concat(prefix, "_badthing:\n .word 0xf7f0a000");
}
// RISC-V counterpart of GetArmDependentBranch: branch depends on both
// pointer chasing results (x5/x6).
public static string GetRiscvDependentBranch(string prefix)
{
    return " beq x5, x6, " + prefix + "_badthing";
}
// Landing pad for GetRiscvDependentBranch: label plus an all-zero word,
// which is not expected to execute.
public static string GetRiscvDependentBranchTarget(string prefix)
{
    return string.Concat(prefix, "_badthing:\n .word 0x00000000");
}
/// <summary>
/// Generates ARM (aarch64) non-scheduling-queue test functions where the
/// long-latency op is a chain of dependent udivs. After each chain, maxSize
/// instructions are emitted: the first (counts[i], or counts[i]-6 if divsInSq)
/// depend on the chain's result, and the rest are independent.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="maxSize">Total instructions emitted after each divide chain</param>
/// <param name="counts">Number of dependent instructions to test; one function per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="depInstrs">Instructions dependent on the divide chain result</param>
/// <param name="indepInstrs">Instructions independent of the divide chain</param>
/// <param name="divsInSq">If true, count the six chain instructions (mov + 5 udivs)
/// against the tested queue</param>
/// <param name="initInstrs">Extra initialization instructions, or null</param>
public static void GenerateArmAsmDivNsqTestFuncs(StringBuilder sb,
    int maxSize,
    int[] counts,
    string funcNamePrefix,
    string[] depInstrs,
    string[] indepInstrs,
    bool divsInSq = false,
    string initInstrs = null)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        // args in x0 = iterations, x1 = list size, x2 = list (sink)
        sb.AppendLine("\n" + funcName + ":");
        sb.AppendLine(" sub sp, sp, #0x50");
        sb.AppendLine(" stp x14, x15, [sp, #0x10]");
        sb.AppendLine(" stp x12, x13, [sp, #0x20]");
        sb.AppendLine(" stp x10, x11, [sp, #0x30]");
        sb.AppendLine(" stp x25, x26, [sp, #0x40]");
        // known values for filler instructions; w13 (=3) is also the divisor
        sb.AppendLine(" mov x15, 1");
        sb.AppendLine(" mov x14, 2");
        sb.AppendLine(" mov x13, 3");
        sb.AppendLine(" mov x12, 4");
        sb.AppendLine(" mov x11, 5");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        sb.AppendLine(" mov w25, 0x0");
        sb.AppendLine(" mov w26, 0x40");
        sb.AppendLine("\n" + funcName + "start:");
        // first chain: five dependent udivs feeding w25
        sb.AppendLine(" mov w25, w1");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        int fillerInstrCount = divsInSq ? counts[i] - 6 : counts[i];
        // emit maxSize instrs total: dependent first, then independent filler
        for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < maxSize; fillerIdx++)
        {
            if (fillerIdx < fillerInstrCount)
            {
                sb.AppendLine(depInstrs[depInstrIdx]);
                depInstrIdx = (depInstrIdx + 1) % depInstrs.Length;
            }
            else
            {
                sb.AppendLine(indepInstrs[indepInstrIdx]);
                indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;
            }
        }
        // second chain, mirrored into w26
        sb.AppendLine(" mov w26, w1");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" mov w25, w26"); // second block's dep instrs also read w25
        for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < maxSize; fillerIdx++)
        {
            if (fillerIdx < fillerInstrCount)
            {
                sb.AppendLine(depInstrs[depInstrIdx]);
                depInstrIdx = (depInstrIdx + 1) % depInstrs.Length;
            }
            else
            {
                sb.AppendLine(indepInstrs[indepInstrIdx]);
                indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;
            }
        }
        sb.AppendLine(" sub x0, x0, 1");
        sb.AppendLine(" cbnz x0, " + funcName + "start");
        sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
        sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
        sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
        sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
        sb.AppendLine(" add sp, sp, #0x50");
        sb.AppendLine(" ret\n\n");
    }
}
/// <summary>
/// Generates structure test functions with filler instructions between two
/// pointer chasing loads for the mips64 target.
/// NOTE(review): the emitted mnemonics (ld.d/addi.d/bnez, $r registers) look like
/// LoongArch rather than classic MIPS — confirm the intended assembler/target.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test; one function is generated per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load</param>
/// <param name="includePtrChasingLoads">If true, count the two pointer chasing loads
/// against the tested structure (two fewer filler instructions emitted)</param>
/// <param name="initInstrs">Extra initialization instructions, or null</param>
/// <param name="postLoadInstrs1">Instructions emitted right after the first load, or null</param>
/// <param name="postLoadInstrs2">Instructions emitted right after the second load, or null</param>
/// <param name="dsb">Accepted for signature parity with the other ISA generators;
/// currently unused in this method</param>
public static void GenerateMipsAsmStructureTestFuncs(StringBuilder sb,
    int[] counts,
    string funcNamePrefix,
    string[] fillerInstrs1,
    string[] fillerInstrs2,
    bool includePtrChasingLoads = false,
    string initInstrs = null,
    string postLoadInstrs1 = null,
    string postLoadInstrs2 = null,
    bool dsb = false)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        // args in r4 = iterations, r5 = list, r6 = list (sink)
        // use r12 and r13 for ptr chasing loads, r14 as decrement for iteration count
        sb.AppendLine("\n" + funcName + ":");
        sb.AppendLine(" ld.d $r12, $r5, 0");
        sb.AppendLine(" ld.d $r13, $r5, 64");
        // r14 = 1, used as the per-iteration decrement
        sb.AppendLine(" xor $r14, $r14, $r14");
        sb.AppendLine(" addi.d $r14, $r14, 1");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        sb.AppendLine("\n" + funcName + "start:");
        sb.AppendLine(" ld.d $r12, $r12, 0"); // current = *current
        if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);
        int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
        for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
        {
            sb.AppendLine(fillerInstrs1[addIdx]);
            addIdx = (addIdx + 1) % fillerInstrs1.Length;
        }
        sb.AppendLine(" ld.d $r13, $r13, 0");
        if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);
        for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
        {
            sb.AppendLine(fillerInstrs2[addIdx]);
            addIdx = (addIdx + 1) % fillerInstrs2.Length;
        }
        sb.AppendLine(" sub.d $r4, $r4, $r14");
        sb.AppendLine(" bnez $r4, " + funcName + "start");
        sb.AppendLine(" jr $r1"); // return via link register
    }
}
/// <summary>
/// Generates RISC-V structure test functions with filler instructions between
/// two pointer chasing loads.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test; one function is generated per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after the second load (not emitted when fence is true)</param>
/// <param name="includePtrChasingLoads">If true, count the two pointer chasing loads
/// against the tested structure (two fewer filler instructions emitted)</param>
/// <param name="initInstrs">Extra initialization instructions, or null</param>
/// <param name="postLoadInstrs1">Instructions emitted right after the first load, or null</param>
/// <param name="postLoadInstrs2">Instructions emitted right after the second load (non-fence mode), or null</param>
/// <param name="fence">Emit a fence after the second load instead of the second filler block</param>
public static void GenerateRiscvAsmStructureTestFuncs(StringBuilder sb,
    int[] counts,
    string funcNamePrefix,
    string[] fillerInstrs1,
    string[] fillerInstrs2,
    bool includePtrChasingLoads = false,
    string initInstrs = null,
    string postLoadInstrs1 = null,
    string postLoadInstrs2 = null,
    bool fence = true)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        // args in x10 = iterations, x11 = list, x12 = list (sink)
        // temporaries are x5-x7, x28-x31
        // x18-27 are to be saved
        // use x5 and x6 for ptr chasing loads
        sb.AppendLine("\n" + funcName + ":");
        sb.AppendLine(" addi sp, sp, -88");
        sb.AppendLine(" sd x18, 0(sp)");
        sb.AppendLine(" sd x19, 8(sp)");
        sb.AppendLine(" sd x20, 16(sp)");
        sb.AppendLine(" sd x21, 24(sp)");
        sb.AppendLine(" sd x22, 32(sp)");
        sb.AppendLine(" sd x23, 40(sp)");
        sb.AppendLine(" sd x24, 48(sp)");
        sb.AppendLine(" sd x25, 56(sp)");
        sb.AppendLine(" sd x26, 64(sp)");
        sb.AppendLine(" sd x27, 72(sp)");
        // nudge registers so filler instructions have known-live sources
        sb.AppendLine(" addi x28, x28, 1");
        sb.AppendLine(" addi x29, x29, 1");
        sb.AppendLine(" addi x30, x30, 1");
        sb.AppendLine(" addi x31, x31, 1");
        sb.AppendLine(" addi x18, x18, 2");
        sb.AppendLine(" addi x19, x19, 3");
        sb.AppendLine(" addi x20, x20, 4");
        // FIX: was "addi x22, x21, 5", which broke the reg += const pattern above,
        // leaving x21 untouched and clobbering x22 instead.
        sb.AppendLine(" addi x21, x21, 5");
        sb.AppendLine(" ld x5, (x11)");
        sb.AppendLine(" ld x6, 64(x11)");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        sb.AppendLine("\n" + funcName + "start:");
        sb.AppendLine(" ld x5, (x5)"); // current = *current
        if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);
        int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
        for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
        {
            sb.AppendLine(fillerInstrs1[addIdx]);
            addIdx = (addIdx + 1) % fillerInstrs1.Length;
        }
        sb.AppendLine(" ld x6, (x6)");
        if (fence) sb.AppendLine(" fence");
        else
        {
            if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);
            for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
            {
                sb.AppendLine(fillerInstrs2[addIdx]);
                addIdx = (addIdx + 1) % fillerInstrs2.Length;
            }
        }
        // NOTE(review): "bge x10, x0" loops while x10 >= 0, so the body runs
        // iterations+1 times; the other ISA generators use a != 0 check.
        // Left as-is — confirm whether the extra iteration is intended.
        sb.AppendLine(" addi x10, x10, -1");
        sb.AppendLine(" bge x10, x0, " + funcName + "start");
        sb.AppendLine(" ld x18, 0(sp)");
        sb.AppendLine(" ld x19, 8(sp)");
        sb.AppendLine(" ld x20, 16(sp)");
        sb.AppendLine(" ld x21, 24(sp)");
        sb.AppendLine(" ld x22, 32(sp)");
        sb.AppendLine(" ld x23, 40(sp)");
        sb.AppendLine(" ld x24, 48(sp)");
        sb.AppendLine(" ld x25, 56(sp)");
        sb.AppendLine(" ld x26, 64(sp)");
        sb.AppendLine(" ld x27, 72(sp)");
        sb.AppendLine(" addi sp, sp, 88");
        sb.AppendLine(" ret");
    }
}
}
}
================================================
FILE: AsmGen/tests/A73RobTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
/// <summary>
/// Looking for reordering capacity limits on A73 by combining several different
/// instruction types (128-bit vector adds, scalar integer adds, and stores)
/// behind a pointer chasing load.
/// </summary>
public class A73RobTest : UarchTest
{
    public A73RobTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "a73rob";
        this.Description = "Mixed integer/vec128 + stores";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            // dependent branch after each load (A73 workaround, see GetArmDependentBranch)
            string postLoadInstrs = UarchTestHelpers.GetArmDependentBranch(this.Prefix);
            string initInstrs = " ldr q0, [x1]\n" +
                " ldr q1, [x1, #0x10]\n" +
                " ldr q2, [x1, #0x20]\n" +
                " ldr q3, [x1, #0x30]\n" +
                " ldr q4, [x1, #0x40]\n";
            // FIX: was a non-generic "List", which does not exist under
            // System.Collections.Generic; restore the lost <string> type argument.
            List<string> fillerInstrs = new List<string>();
            for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++)
            {
                // mix instruction types: first 33 vector adds, next 33 integer adds, then stores
                if (i < 33) fillerInstrs.Add(" add v1.4s, v1.4s, v0.4s");
                else if (i < 66) fillerInstrs.Add(" add x15, x15, x11");
                else fillerInstrs.Add(" str x12, [x2]");
            }
            string[] fillerInstrsArr = fillerInstrs.ToArray();
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, fillerInstrsArr, fillerInstrsArr, false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
            // emit the shared (never-executed) branch target once after all functions
            sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
    }
}
}
================================================
FILE: AsmGen/tests/AddLoopTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class AddLoopTest : UarchTest
{
    /// <summary>
    /// ADD throughput for various loop sizes. The dec/branch pair counts toward
    /// the loop size, so real adds emitted per loop = size - 2.
    /// </summary>
    /// <param name="low">Smallest loop size; must be greater than 2</param>
    /// <param name="high">Largest loop size</param>
    /// <param name="step">Step between tested sizes</param>
    public AddLoopTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "addloop";
        this.Description = $"ADD throughput for various loop sizes. Avoids NOP fusing";
        this.FunctionDefinitionParameters = "uint64_t iterations";
        this.GetFunctionCallParameters = "structIterations";
        this.DivideTimeByCount = true;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        if (isa == IUarchTest.ISA.mips64) return false;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);
        if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb);
    }

    // x86 (AT&T syntax): loop of independent adds terminated by dec/jnz on rdi (iteration count)
    public void GenerateX86GccAsm(StringBuilder sb)
    {
        // four independent adds, cycled to avoid a serial dependency chain
        string[] unrolledAdds = new string[4];
        unrolledAdds[0] = " add %r11, %r15";
        unrolledAdds[1] = " add %r11, %r14";
        unrolledAdds[2] = " add %r11, %r13";
        unrolledAdds[3] = " add %r11, %r12";
        for (int i = 0; i < Counts.Length; i++)
        {
            string funcName = this.Prefix + this.Counts[i];
            sb.AppendLine(funcName + ":");
            // count dec, jnz as instructions in the loop
            for (int nopIdx = 0; nopIdx < this.Counts[i] - 2; nopIdx++) sb.AppendLine(unrolledAdds[nopIdx & 3]);
            sb.AppendLine(" dec %rdi");
            sb.AppendLine(" jnz " + funcName);
            sb.AppendLine(" ret");
        }
    }

    // aarch64: same structure, loop terminated by sub/cbnz on x0 (iteration count)
    public void GenerateArmAsm(StringBuilder sb)
    {
        string[] unrolledAdds = new string[4];
        unrolledAdds[0] = " add x15, x15, x11";
        unrolledAdds[1] = " add x14, x14, x11";
        unrolledAdds[2] = " add x13, x13, x11";
        unrolledAdds[3] = " add x12, x12, x11";
        for (int i = 0; i < Counts.Length; i++)
        {
            string funcName = this.Prefix + this.Counts[i];
            sb.AppendLine(funcName + ":");
            // sub/cbnz count toward the loop size, hence Counts[i] - 2 adds
            for (int nopIdx = 0; nopIdx < this.Counts[i] - 2; nopIdx++) sb.AppendLine(unrolledAdds[nopIdx & 3]);
            sb.AppendLine(" sub x0, x0, 1");
            sb.AppendLine(" cbnz x0, " + funcName);
            sb.AppendLine(" ret");
        }
    }
}
}
================================================
FILE: AsmGen/tests/AddNsq.cs
================================================
using System.Text;
namespace AsmGen
{
public class AddNsq : UarchTest
{
    // total instructions emitted per pointer chasing load (dependent + independent)
    private int totalOps;

    /// <summary>
    /// Integer add non-scheduling-queue test: dependent adds behind a pointer
    /// chasing load, padded with independent adds up to totalOps.
    /// </summary>
    public AddNsq(int low, int high, int step, int totalOps)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "addnsq" + totalOps;
        Description = "Integer adds, excluding possible NSQ";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        GetFunctionCallParameters = "structIterations, A, fpArr";
        DivideTimeByCount = false;
        this.totalOps = totalOps;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // aarch64 support is currently disabled
        return isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;
        // adds reading rdi (the pointer chasing result) must wait for the load
        string[] dependent = { " add %rdi, %r15", " add %rdi, %r14" };
        // adds on unrelated registers can issue immediately
        string[] independent = { " add %r13, %r11", " add %r12, %r11" };
        UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, totalOps, Counts, Prefix, dependent, independent, false);
    }
}
}
================================================
FILE: AsmGen/tests/AddSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// Integer scheduler capacity test: chains of adds dependent on pointer chasing
/// load results, generated for each supported ISA.
/// </summary>
public class AddSchedTest : UarchTest
{
    public AddSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "addsched";
        this.Description = "Scheduler, Integer Adds";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        if (isa == IUarchTest.ISA.mips64) return true;
        if (isa == IUarchTest.ISA.riscv) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // all adds read rdi, which the generator ties to the pointer chasing result
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " add %rdi, %r15";
            unrolledAdds[1] = " add %rdi, %r14";
            unrolledAdds[2] = " add %rdi, %r13";
            unrolledAdds[3] = " add %rdi, %r12";
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // adds read x25, the pointer chasing result register in the ARM generator
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " add x15, x15, x25";
            unrolledAdds[1] = " add x14, x14, x25";
            unrolledAdds[2] = " add x13, x13, x25";
            unrolledAdds[3] = " add x12, x12, x25";
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // separate arrays per chain: block 1 reads $r12, block 2 reads $r13
            // (the generator's two pointer chasing result registers)
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " add.d $r15, $r15, $r12";
            unrolledAdds[1] = " add.d $r16, $r16, $r12";
            unrolledAdds[2] = " add.d $r17, $r17, $r12";
            unrolledAdds[3] = " add.d $r18, $r18, $r12";
            string[] unrolledAdds1 = new string[4];
            unrolledAdds1[0] = " add.d $r15, $r15, $r13";
            unrolledAdds1[1] = " add.d $r16, $r16, $r13";
            unrolledAdds1[2] = " add.d $r17, $r17, $r13";
            unrolledAdds1[3] = " add.d $r18, $r18, $r13";
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, includePtrChasingLoads: false);
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            // block 1 reads x5, block 2 reads x6 (the RISC-V generator's load results)
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " add x30, x30, x5";
            unrolledAdds[1] = " add x29, x29, x5";
            unrolledAdds[2] = " add x28, x28, x5";
            unrolledAdds[3] = " add x31, x31, x5";
            string[] unrolledAdds1 = new string[4];
            unrolledAdds1[0] = " add x30, x30, x6";
            unrolledAdds1[1] = " add x31, x31, x6";
            unrolledAdds1[2] = " add x28, x28, x6";
            unrolledAdds1[3] = " add x29, x29, x6";
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, false);
        }
    }
}
}
================================================
FILE: AsmGen/tests/AddvNsq.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// ADDV (vector horizontal add) non-scheduling-queue test, aarch64 only:
/// ADDVs reading a vector loaded after the pointer chasing load are dependent;
/// ADDVs reading a vector loaded once up front are independent padding.
/// </summary>
public class AddvNsq : UarchTest
{
    // total instructions emitted per pointer chasing load (dependent + independent)
    private int totalOps;

    public AddvNsq(int low, int high, int step, int totalOps)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "addvnsq";
        this.Description = "ADDV, excluding possible NSQ";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.totalOps = totalOps;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            // d16 is reloaded after the pointer chasing load (address depends on w25)
            string postLoadInstrs1 = " ldr d16, [x2, w25, sxtw #0]";
            // d15 is loaded once up front: ADDVs reading v15 are independent
            string initInstrs = " ldr d15, [x2]";
            string[] depInstrs = new string[4];
            depInstrs[0] = " addv h1, v16.4h";
            depInstrs[1] = " addv h2, v16.4h";
            depInstrs[2] = " addv h3, v16.4h";
            depInstrs[3] = " addv h4, v16.4h";
            string[] indepInstrs = new string[4];
            indepInstrs[0] = " addv h1, v15.4h";
            indepInstrs[1] = " addv h2, v15.4h";
            indepInstrs[2] = " addv h3, v15.4h";
            indepInstrs[3] = " addv h4, v15.4h";
            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs,
                postLoadInstrs: postLoadInstrs1);
        }
    }
}
}
================================================
FILE: AsmGen/tests/AddvSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class AddvSched : UarchTest
{
public AddvSched(int low, int high, int step)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "addvsched";
this.Description = "ADDV Scheduler";
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
this.GetFunctionCallParameters = "structIterations, A, fpArr";
this.DivideTimeByCount = false;
}
public override bool SupportsIsa(IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.aarch64) return true;
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.aarch64)
{
string postLoadInstrs1 = " ldr q16, [x2, w25, sxtw #0]";
string postLoadInstrs2 = " ldr q16, [x2, w25, sxtw #0]";
string[] unrolledInstrs = new string[4];
unrolledInstrs[0] = " addv h1, v16.4h";
unrolledInstrs[1] = " addv h2, v16.4h";
unrolledInstrs[2] = " addv h3, v16.4h";
unrolledInstrs[3] = " addv h4, v16.4h";
UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null,
postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
}
}
}
}
================================================
FILE: AsmGen/tests/AeseSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class AeseSchedTest : UarchTest
{
public AeseSchedTest(int low, int high, int step)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "aesesched";
this.Description = "aese scheduler";
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
this.GetFunctionCallParameters = "structIterations, A, fpArr";
this.DivideTimeByCount = false;
}
public override bool SupportsIsa(IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64) return true;
if (isa == IUarchTest.ISA.aarch64) return true;
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64)
{
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " aesenc %xmm0, %xmm1";
unrolledAdds[1] = " aesenc %xmm0, %xmm2";
unrolledAdds[2] = " aesenc %xmm0, %xmm3";
unrolledAdds[3] = " aesenc %xmm0, %xmm4";
UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
}
if (isa == IUarchTest.ISA.aarch64)
{
string postLoadInstrs1 = " ldr q0, [x2, w25, uxtw#0]";
string postLoadInstrs2 = " ldr q0, [x2, w26, uxtw#0]";
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " aese v1.16b, v0.16b";
unrolledAdds[1] = " aese v2.16b, v0.16b";
unrolledAdds[2] = " aese v3.16b, v0.16b";
unrolledAdds[3] = " aese v4.16b, v0.16b";
UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, null, postLoadInstrs1, postLoadInstrs2);
}
}
}
}
================================================
FILE: AsmGen/tests/AesencNsq.cs
================================================
using System.Text;
namespace AsmGen
{
public class AesencNsq : UarchTest
{
private int totalOps;
public AesencNsq(int low, int high, int step, int totalOps)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "aesencnsq" + totalOps;
this.Description = "AESENC, excluding possible NSQ";
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
this.GetFunctionCallParameters = "structIterations, A, fpArr";
this.DivideTimeByCount = false;
this.totalOps = totalOps;
}
public override bool SupportsIsa(IUarchTest.ISA isa)
{
// if (isa == IUarchTest.ISA.aarch64) return true;
if (isa == IUarchTest.ISA.amd64) return true;
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64)
{
string postLoadInstrs = " mov %rdi, %r15\n add %r8, %r15\n movdqu (%r15), %xmm1";
string initInstrs = " movdqu (%r8), %xmm2";
string[] depInstrs = new string[4];
depInstrs[0] = " aesenc %xmm1, %xmm0";
depInstrs[1] = " aesenc %xmm1, %xmm3";
depInstrs[2] = " aesenc %xmm1, %xmm4";
depInstrs[3] = " aesenc %xmm1, %xmm5";
string[] indepInstrs = new string[2];
indepInstrs[0] = " aesenc %xmm2, %xmm6";
indepInstrs[1] = " aesenc %xmm2, %xmm7";
UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs, postLoadInstrs);
}
else if (isa == IUarchTest.ISA.aarch64)
{
string postLoadInstrs1 = " ldr s16, [x2, w25, uxtw #2]";
string initInstrs = " ldr s15, [x2]";
string[] depInstrs = new string[4];
depInstrs[0] = " fadd s0, s0, s16";
depInstrs[1] = " fadd s1, s1, s16";
depInstrs[2] = " fadd s2, s2, s16";
depInstrs[3] = " fadd s3, s3, s16";
string[] indepInstrs = new string[4];
indepInstrs[0] = " fadd s17, s17, s15";
indepInstrs[1] = " fadd s18, s18, s15";
indepInstrs[2] = " fadd s19, s19, s15";
indepInstrs[3] = " fadd s20, s20, s15";
UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs,
postLoadInstrs: postLoadInstrs1);
}
}
}
}
================================================
FILE: AsmGen/tests/BranchBufferTest.cs
================================================
using System.Text;
namespace AsmGen
{
    /// <summary>
    /// Branch order buffer (BOB) capacity test. Each iteration begins with a
    /// pointer-chasing load, then issues a run of never-taken conditional
    /// branches that cannot retire until that load completes. Timing vs. branch
    /// count reveals how many unretired branches the core can track.
    /// </summary>
    public class BranchBufferTest : UarchTest
    {
        // If true, insert a nop after each branch to spread branches out (x86 only).
        private bool mixNops;
        // If true, follow each load with a branch that depends on the loaded
        // value. Only generated for aarch64 (see SupportsIsa).
        private bool initialDependentBranch;

        public BranchBufferTest(int low, int high, int step, bool mixNops = false, bool initialDependentBranch = false)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "bob" + (initialDependentBranch ? "db" : string.Empty);
            this.Description = "Branch Order Buffer Test (not-taken branches pending retire)" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty); ;
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
            this.mixNops = mixNops;
            this.initialDependentBranch = initialDependentBranch;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            // The dependent-branch variant is only implemented for aarch64.
            if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.mips64) return true;
            return false;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                GenerateX86GccAsm(sb);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                GenerateArmAsm(sb);
                // Emit the shared out-of-line target that the dependent branches use.
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                GenerateMipsAsm(sb);
            }
        }

        // Emits one test function per branch count in GNU (AT&T) x86-64 syntax.
        // r14=1 and r11=4 are loop-invariant, so every `cmp %r14, %r11; je` pair
        // below is a never-taken branch that still occupies a BOB entry.
        public void GenerateX86GccAsm(StringBuilder sb)
        {
            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = Prefix + Counts[i];
                sb.AppendLine("\n" + funcName + ":");
                sb.AppendLine(" push %rsi");
                sb.AppendLine(" push %rdi");
                sb.AppendLine(" push %r15");
                sb.AppendLine(" push %r14");
                sb.AppendLine(" push %r13");
                sb.AppendLine(" push %r12");
                sb.AppendLine(" push %r11");
                sb.AppendLine(" push %r8");
                sb.AppendLine(" push %rcx");
                sb.AppendLine(" push %rdx");
                // arguments are in RDI, RSI, RDX, RCX, R8, and R9
                // move them into familiar windows argument regs (rcx, rdx, r8)
                sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
                sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
                sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
                sb.AppendLine(" xor %r15, %r15");
                sb.AppendLine(" mov $0x1, %r14");
                sb.AppendLine(" mov $0x2, %r13");
                sb.AppendLine(" mov $0x3, %r12");
                sb.AppendLine(" mov $0x4, %r11");
                // rdi/rsi index two independent pointer-chasing chains in the array,
                // starting at elements 0 and 0x40.
                sb.AppendLine(" xor %rdi, %rdi");
                sb.AppendLine(" mov $0x40, %esi");
                sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
                sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
                sb.AppendLine("\n" + funcName + "start:");
                // current = A[current]; the branches below retire behind this load
                sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
                {
                    string jumpLabel = $"{funcName}_edi_target{fillerIdx}";
                    sb.AppendLine($" cmp %r14, %r11");
                    sb.AppendLine($" je {jumpLabel}");
                    // try to space the jumps out a bit
                    if (this.mixNops) sb.AppendLine($" nop");
                    sb.AppendLine($"{jumpLabel}:");
                }
                // second chain's load, followed by another run of not-taken branches
                sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
                {
                    string jumpLabel = $"{funcName}_esi_target{fillerIdx}";
                    sb.AppendLine($" cmp %r14, %r11");
                    sb.AppendLine($" je {jumpLabel}");
                    if (this.mixNops) sb.AppendLine($" nop");
                    // try to space the jumps out a bit
                    sb.AppendLine($"{jumpLabel}:");
                }
                sb.AppendLine(" dec %rcx");
                sb.AppendLine(" jne " + funcName + "start");
                sb.AppendLine(" pop %rdx");
                sb.AppendLine(" pop %rcx");
                sb.AppendLine(" pop %r8");
                sb.AppendLine(" pop %r11");
                sb.AppendLine(" pop %r12");
                sb.AppendLine(" pop %r13");
                sb.AppendLine(" pop %r14");
                sb.AppendLine(" pop %r15");
                sb.AppendLine(" pop %rdi");
                sb.AppendLine(" pop %rsi");
                sb.AppendLine(" ret\n\n");
            }
        }

        // aarch64 version. x15=1 and x10=6 never compare equal, so each
        // `cmp x15, x10; b.eq` pair is a never-taken branch pending retire.
        public void GenerateArmAsm(StringBuilder sb)
        {
            string dependentBranch = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = Prefix + Counts[i];
                sb.AppendLine("\n" + funcName + ":");
                // save x10-x15, x25, x26 on the stack before clobbering them
                sb.AppendLine(" sub sp, sp, #0x50");
                sb.AppendLine(" stp x14, x15, [sp, #0x10]");
                sb.AppendLine(" stp x12, x13, [sp, #0x20]");
                sb.AppendLine(" stp x10, x11, [sp, #0x30]");
                sb.AppendLine(" stp x25, x26, [sp, #0x40]");
                sb.AppendLine(" mov x15, 1");
                sb.AppendLine(" mov x14, 2");
                sb.AppendLine(" mov x13, 3");
                sb.AppendLine(" mov x12, 4");
                sb.AppendLine(" mov x11, 5");
                sb.AppendLine(" mov x10, 6");
                // w25/w26 index two independent pointer-chasing chains
                sb.AppendLine(" mov w25, 0x0");
                sb.AppendLine(" mov w26, 0x40");
                sb.AppendLine("\n" + funcName + "start:");
                sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
                if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
                {
                    string jumpLabel = $"{funcName}_w25_target{fillerIdx}";
                    sb.AppendLine($" cmp x15, x10");
                    sb.AppendLine($" b.eq {jumpLabel}");
                    sb.AppendLine($"{jumpLabel}:");
                }
                // second chain's load, then another run of not-taken branches
                sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
                if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
                {
                    string jumpLabel = $"{funcName}_w26_target{fillerIdx}";
                    sb.AppendLine($" cmp x15, x10");
                    sb.AppendLine($" b.eq {jumpLabel}");
                    sb.AppendLine($"{jumpLabel}:");
                }
                sb.AppendLine(" sub x0, x0, 1");
                sb.AppendLine(" cbnz x0, " + funcName + "start");
                // restore saved registers and return
                sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
                sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
                sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
                sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
                sb.AppendLine(" add sp, sp, #0x50");
                sb.AppendLine(" ret\n\n");
            }
        }

        // NOTE(review): despite the mips64 name, mnemonics like ld.d/addi.d/sub.d
        // and the $r1 return register look like LoongArch — confirm the target.
        // r15 is set to 15 up front, so every `beqz $r15` below is never taken;
        // the collected jump targets (which just return) are emitted after all
        // functions and are never actually reached.
        public void GenerateMipsAsm(StringBuilder sb)
        {
            StringBuilder ntJumpTargets = new StringBuilder();
            for (int i = 0; i < Counts.Length; i++)
            {
                string initInstrs = " move $r15, $r0\n addi.d $r15, $r15, 15";
                string funcName = this.Prefix + Counts[i];
                // args in r4 = iterations, r5 = list, r6 = list (sink)
                // use r12 and r13 for ptr chasing loads, r14 as decrement for iteration count
                sb.AppendLine("\n" + funcName + ":");
                sb.AppendLine(" ld.d $r12, $r5, 0");
                sb.AppendLine(" ld.d $r13, $r5, 64");
                sb.AppendLine(" xor $r14, $r14, $r14");
                sb.AppendLine(" addi.d $r14, $r14, 1");
                sb.AppendLine(initInstrs);
                sb.AppendLine("\n" + funcName + "start:");
                sb.AppendLine(" ld.d $r12, $r12, 0");
                int fillerInstrCount = Counts[i];
                for (int instrIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
                {
                    string jumpLabel = "dontenduphere_r12_" + this.Prefix + "_" + Counts[i] + "_" + instrIdx;
                    sb.AppendLine($" beqz $r15, {jumpLabel}");
                    ntJumpTargets.AppendLine(jumpLabel + ":");
                    ntJumpTargets.AppendLine(" jr $r1");
                }
                sb.AppendLine(" ld.d $r13, $r13, 0");
                for (int instrIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
                {
                    string jumpLabel = "dontenduphere_r13_" + this.Prefix + "_" + Counts[i] + "_" + instrIdx;
                    sb.AppendLine($" beqz $r15, {jumpLabel}");
                    ntJumpTargets.AppendLine(jumpLabel + ":");
                    ntJumpTargets.AppendLine(" jr $r1");
                }
                sb.AppendLine(" sub.d $r4, $r4, $r14");
                sb.AppendLine(" bnez $r4, " + funcName + "start");
                sb.AppendLine(" jr $r1");
            }
            sb.AppendLine(ntJumpTargets.ToString());
        }
    }
}
================================================
FILE: AsmGen/tests/BranchHistoryTest.cs
================================================
using System.IO;
using System.Text;
namespace AsmGen
{
    /// <summary>
    /// Branch predictor pattern-recognition test. Each generated function runs a
    /// fixed number of conditional branches per iteration, each taken/not-taken
    /// according to its own pattern array. The C harness sweeps pattern (history)
    /// lengths at runtime to map out global history capacity.
    /// </summary>
    public class BranchHistoryTest : IUarchTest
    {
        public string Prefix { get; private set; }
        public string Description { get; private set; }
        public string FunctionDefinitionParameters { get; private set; }
        public string GetFunctionCallParameters { get; private set; }
        public bool DivideTimeByCount { get; private set; }
        // One test function is generated per entry in branchCounts.
        private int[] branchCounts;
        // Pattern lengths the runtime harness sweeps (see GenerateInitializationCode).
        private int[] historyCounts;

        public BranchHistoryTest()
        {
            Prefix = "branchhist";
            Description = "Branch predictor pattern recognition";
            FunctionDefinitionParameters = "uint64_t iterations, uint32_t **arr, uint32_t arrLen";
            GetFunctionCallParameters = "structIterations";
            DivideTimeByCount = true;
            branchCounts = new int[] { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 };
            // NOTE(review): 24567 below looks like a typo for 24576 (24K), judging
            // by the progression around it — confirm intent before changing.
            historyCounts = new int[] { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768, 1024, 1536,
                2048, 3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24567, 32768 };
        }

        public bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.mips64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        public void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb);
            if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);
            if (isa == IUarchTest.ISA.mips64) GenerateMipsAsm(sb);
            if (isa == IUarchTest.ISA.riscv) GenerateRiscvAsm(sb);
        }

        // aarch64: x0 = iterations, x1 = array of pattern-array pointers,
        // w2 = pattern length. x12 counts not-taken branches (returned in x0).
        public void GenerateArmAsm(StringBuilder sb)
        {
            for (int i = 0; i < branchCounts.Length; i++)
            {
                string functionLabel = Prefix + branchCounts[i];
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine("\n" + functionLabel + ":");
                sb.AppendLine(" sub sp, sp, #0x40");
                sb.AppendLine(" stp x11, x12, [sp, #0x30]");
                sb.AppendLine(" stp x15, x16, [sp, #0x20]");
                sb.AppendLine(" stp x13, x14, [sp, #0x10]");
                sb.AppendLine(" eor x16, x16, x16");
                sb.AppendLine(" eor x15, x15, x15");
                sb.AppendLine(" eor x12, x12, x12");
                sb.AppendLine(" eor x11, x11, x11");
                // w14 = branch index, w16 = pattern array index
                sb.AppendLine(loopLabel + ":");
                sb.AppendLine(" eor w14, w14, w14");
                // generate branch blocks
                for (int branchCount = 0; branchCount < branchCounts[i]; branchCount++)
                {
                    string jumpTarget = functionLabel + branchCounts[i] + "_zero" + branchCount;
                    // x15 = this branch's pattern array; w13 = pattern element
                    sb.AppendLine(" ldr x15, [x1, w14, uxtw #3]");
                    sb.AppendLine(" add w14, w14, 1");
                    sb.AppendLine(" ldr w13, [x15, w16, uxtw #2]");
                    sb.AppendLine($" cbnz x13, {jumpTarget}");
                    sb.AppendLine(" add x12, x12, 1");
                    sb.AppendLine(jumpTarget + ":");
                }
                // increment w16, and basically cmov 0 -> w16 if w16 = list length
                sb.AppendLine(" add w16, w16, 1");
                sb.AppendLine(" cmp w16, w2");
                sb.AppendLine(" csel w16, w11, w16, EQ");
                sb.AppendLine(" sub x0, x0, 1");
                sb.AppendLine($" cbnz x0, {loopLabel}");
                sb.AppendLine(" mov x0, x12");
                sb.AppendLine(" ldp x11, x12, [sp, #0x30]");
                sb.AppendLine(" ldp x15, x16, [sp, #0x20]");
                sb.AppendLine(" ldp x13, x14, [sp, #0x10]");
                sb.AppendLine(" add sp, sp, #0x40");
                sb.AppendLine(" ret");
            }
        }

        // x86-64 SysV: rdi = iterations, rsi = array of pattern-array pointers,
        // rdx = pattern length. r8 counts not-taken branches (returned in rax).
        public void GenerateX86GccAsm(StringBuilder sb)
        {
            for (int i = 0; i < branchCounts.Length; i++)
            {
                string functionLabel = Prefix + branchCounts[i];
                sb.AppendLine("\n" + functionLabel + ":");
                sb.AppendLine(" push %rbx");
                sb.AppendLine(" push %r8");
                sb.AppendLine(" push %r9");
                sb.AppendLine(" xor %rbx, %rbx");
                sb.AppendLine(" xor %r8, %r8");
                sb.AppendLine(" xor %r9, %r9");
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine("\n" + loopLabel + ":");
                sb.AppendLine(" xor %r11, %r11"); // set index into arr of arrs to 0
                for (int branchCount = 0; branchCount < branchCounts[i]; branchCount++)
                {
                    sb.AppendLine(" mov (%rsi,%r11,8), %r10"); // load array base pointer into r10
                    sb.AppendLine(" inc %r11");
                    sb.AppendLine(" mov (%r10,%rbx,4), %eax "); // read element from branch history test array
                    sb.AppendLine(" test %eax, %eax");
                    // conditional branch on test array value
                    string zeroLabel = Prefix + branchCounts[i] + "_zero" + branchCount;
                    sb.AppendLine(" jz " + zeroLabel);
                    sb.AppendLine(" inc %r8"); // r8 is just a sink here
                    sb.AppendLine(zeroLabel + ":");
                }
                // loop around in pattern history test array if necessary
                // avoiding an extra branch to not pollute BPU history
                sb.AppendLine(" inc %rbx");
                sb.AppendLine(" cmp %rbx, %rdx");
                sb.AppendLine(" cmove %r9, %rbx");
                // end of main loop over iteration count
                sb.AppendLine(" dec %rdi");
                sb.AppendLine(" jnz " + loopLabel);
                // function epilogue
                sb.AppendLine(" mov %r8, %rax");
                sb.AppendLine(" pop %r9");
                sb.AppendLine(" pop %r8");
                sb.AppendLine(" pop %rbx");
                sb.AppendLine(" ret");
            }
        }

        // NOTE(review): despite the mips64 name, mnemonics like alsl.d/maskeqz
        // and $r registers look like LoongArch — confirm the actual target ISA.
        public void GenerateMipsAsm(StringBuilder sb)
        {
            // Generate an array of branch history test functions, one for each branch count
            for (int i = 0; i < branchCounts.Length; i++)
            {
                // branchtestFunc(iterations, testArrToArr, historyLen)
                // r4 = iterations, r5 = array of pointers to pattern arrays for each branch, r6 = history length (length of each array)
                // temporary registers: r12-r20
                // write code here
                string functionLabel = Prefix + branchCounts[i];
                sb.AppendLine("\n" + functionLabel + ":");
                // r12 = branch index, r13 = index into pattern array
                sb.AppendLine(" move $r13, $r0");
                sb.AppendLine(" move $r18, $r0");
                sb.AppendLine(" move $r20, $r0");
                sb.AppendLine(" addi.d $r20, $r20, 1");
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine("\n" + loopLabel + ":");
                sb.AppendLine(" move $r12, $r0"); // set branch index to zero
                // generate branch blocks
                for (int branchCount = 0; branchCount < branchCounts[i]; branchCount++)
                {
                    string jumpTarget = functionLabel + branchCounts[i] + "_zero" + branchCount;
                    // load the branch's pattern array
                    sb.AppendLine(" alsl.d $r14, $r12, $r0, 0x3"); // get offset into array in bytes, using r12 as array index.
                    sb.AppendLine(" add.d $r14, $r14, $r5"); // get address into r14
                    sb.AppendLine(" ld.d $r15, $r14, 0"); // r15 = base address of curent branch's target array
                    sb.AppendLine(" addi.d $r12, $r12, 1"); // next branch
                    // load element from pattern array indicating where we should branch
                    sb.AppendLine(" alsl.d $r16, $r13, $r0, 0x2"); // use r13 to index into pattern array
                    sb.AppendLine(" add.d $r16, $r16, $r15"); // r16 = address of element we want to load
                    sb.AppendLine(" ld.w $r17, $r16, 0");
                    sb.AppendLine($" bnez $r17, {jumpTarget}"); // branch if 1
                    sb.AppendLine(" addi.d $r18, $r18, 1");
                    sb.AppendLine(jumpTarget + ":");
                }
                // increment w16, and basically cmov 0 -> w16 if w16 = list length
                // increment r13 (idx into pattern array)
                sb.AppendLine(" addi.d $r13, $r13, 1");
                sb.AppendLine(" sub.d $r19, $r6, $r13"); // r19 = history length - index
                sb.AppendLine(" maskeqz $r13, $r13, $r19"); // set index back to 0 to repeat pattern, if history length - index == 0
                sb.AppendLine(" sub.d $r4, $r4, $r20"); // decrement iteration count
                sb.AppendLine($" bnez $r4, {loopLabel}");
                sb.AppendLine(" move $r4, $r18"); // return the count of NT branches for tracking RNG quality
                sb.AppendLine(" jr $r1");
            }
        }

        // RISC-V: a0 = iterations, a1 = array of pattern-array pointers,
        // a2 = pattern length. t6 counts taken branches (returned in a0).
        public void GenerateRiscvAsm(StringBuilder sb)
        {
            // Generate an array of branch history test functions, one for each branch count
            for (int i = 0; i < branchCounts.Length; i++)
            {
                // branchtestFunc(iterations, testArrToArr, historyLen)
                // a0 = iterations, a1 = array of pointers to pattern arrays for each branch, a2 = length of each array (history length)
                // t0-t7 temporary registers
                // write code here
                string functionLabel = Prefix + branchCounts[i];
                sb.AppendLine("\n" + functionLabel + ":");
                // s0 is callee-saved, so spill it before use
                sb.AppendLine(" addi sp, sp, -16");
                sb.AppendLine(" sd s0, (sp)");
                // t1 = index into pattern array
                sb.AppendLine(" li t1, 0");
                sb.AppendLine(" li t6, 0");
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine("\n" + loopLabel + ":");
                sb.AppendLine(" mv t2, a1"); // start of array of pointers to pattern arrays
                // generate branchCount blocks, each of which traverses its own array
                for (int branchCount = 0; branchCount < branchCounts[i]; branchCount++)
                {
                    string jumpTarget = functionLabel + branchCounts[i] + "_zero" + branchCount;
                    // load the branch's pattern array (a1 -> ptr -> array)
                    sb.AppendLine(" ld t3, (t2)"); // load pointer to array
                    // t3 = base address of branch's array
                    sb.AppendLine(" slli t4, t1, 2");
                    sb.AppendLine(" add t4, t4, t3");
                    sb.AppendLine(" lw t5, (t4)"); // should have 1 or 0
                    sb.AppendLine(" addi t2, t2, 8"); // next branch
                    sb.AppendLine($" beq t5, x0, {jumpTarget}");
                    sb.AppendLine(" addi t6, t6, 1"); // dummy increment to track not-taken/taken branch ratio
                    sb.AppendLine(jumpTarget + ":");
                }
                sb.AppendLine(" addi t1, t1, 1"); // increment array index
                sb.AppendLine(" slt s0, t1, a2"); // 1 if within range
                sb.AppendLine(" mul t1, t1, s0"); // multiply by 1 if within range, 0 otherwise
                // decrement iteration count
                sb.AppendLine(" addi a0, a0, -1");
                sb.AppendLine($" bne a0, x0, {loopLabel}");
                sb.AppendLine(" mv a0, t6");
                sb.AppendLine(" ld s0, (sp)");
                sb.AppendLine(" addi sp, sp, 16");
                sb.AppendLine(" ret");
            }
        }

        // Emits the C dispatch block that runs this test when selected by name.
        public void GenerateTestBlock(StringBuilder sb, IUarchTest.ISA isa)
        {
            sb.AppendLine(" if (argc > 1 && strcmp(test_name, \"" + Prefix + "\") == 0) {");
            sb.AppendLine(" printf(\"" + Description + ":\\n\");");
            GenerateCommonTestBlock(sb);
            sb.AppendLine(" }\n");
        }

        // Declares each generated function as a global assembler symbol.
        public void GenerateAsmGlobalLines(StringBuilder sb)
        {
            for (int i = 0; i < branchCounts.Length; i++)
                sb.AppendLine(".global " + Prefix + branchCounts[i]);
        }

        // kinda hack this to put in initialization code we need
        public void GenerateExternLines(StringBuilder sb)
        {
            for (int i = 0; i < branchCounts.Length; i++)
                sb.AppendLine("extern uint64_t " + Prefix + branchCounts[i] + $"({FunctionDefinitionParameters}) __attribute((sysv_abi));");
            GenerateInitializationCode(sb, true);
            string gccFunction = File.ReadAllText(Path.Combine(Program.DataFilesDir, "GccBranchHistFunction.c"));
            sb.AppendLine(gccFunction);
        }

        // Emits C arrays of branch counts / history lengths and a function-pointer
        // table so the harness can dispatch to the generated asm functions.
        public void GenerateInitializationCode(StringBuilder sb, bool gcc)
        {
            sb.AppendLine($"uint32_t maxBranchCount = {branchCounts.Length};");
            sb.Append($"uint32_t branchCounts[{branchCounts.Length}] = ");
            sb.Append("{ " + branchCounts[0]);
            for (int i = 1; i < branchCounts.Length; i++) sb.Append(", " + branchCounts[i]);
            sb.AppendLine(" };");
            sb.Append($"uint32_t branchHistoryLengths[{historyCounts.Length}] = ");
            sb.Append("{ " + historyCounts[0]);
            for (int i = 1; i < historyCounts.Length; i++) sb.Append(", " + historyCounts[i]);
            sb.AppendLine(" };");
            if (gcc) sb.AppendLine($"uint64_t (__attribute((sysv_abi)) *branchtestFuncArr[{branchCounts.Length}])(uint64_t iterations, uint32_t **arr, uint32_t arrLen);");
            else sb.AppendLine($"uint64_t (*branchtestFuncArr[{branchCounts.Length}])(uint64_t iterations, uint32_t **arr, uint32_t arrLen);");
            sb.AppendLine("void initializeBranchHistFuncArr() {");
            for (int i = 0; i < branchCounts.Length; i++)
            {
                sb.AppendLine($" branchtestFuncArr[{i}] = {Prefix + branchCounts[i]};");
            }
            sb.AppendLine("}");
        }

        // Copies the shared C test-body template from the data files directory.
        public void GenerateCommonTestBlock(StringBuilder sb)
        {
            string branchhistMain = File.ReadAllText(Path.Combine(Program.DataFilesDir, "BranchhistTestBlock.c"));
            sb.AppendLine(branchhistMain);
        }
    }
}
================================================
FILE: AsmGen/tests/BtbTest.cs
================================================
using System;
using System.Text;
namespace AsmGen
{
    /// <summary>
    /// Branch target buffer (BTB) capacity test: generates functions containing a
    /// chain of always-taken branches at a fixed byte spacing. Timing per branch
    /// vs. branch count reveals BTB size and any spacing-related restrictions.
    /// </summary>
    public class BtbTest : UarchTest
    {
        // Byte distance between consecutive branches.
        private int spacing;
        private BranchType branchType;
        // If true (x86 only), vary the padding after alignment to perturb placement.
        private bool varyspacing;

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.mips64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        public enum BranchType
        {
            /// <summary>
            /// Conditional branches that are always taken
            /// </summary>
            Conditional,
            /// <summary>
            /// Unconditional jmps
            /// </summary>
            Unconditional,
            /// <summary>
            /// A mix of both to max out Zen 2's BTB capacity
            /// Optimization guide says one entry can track two branches if they're in the same 64B line
            /// and the first is conditional
            /// </summary>
            ZenMix
        }

        /// <summary>
        /// Constructor for BTB test
        /// </summary>
        /// <param name="spacing">How far apart branches should be, in bytes.
        /// For 4-byte-instruction ISAs, Get4BNopAlign supports 4/8/16/32/64.</param>
        /// <param name="branchType">Kind of branch to emit (conditional branches are still always taken)</param>
        /// <param name="varyspacing">If true, vary nop padding between branches (x86 only)</param>
        public BtbTest(int spacing, BranchType branchType, bool varyspacing = false)
        {
            this.Counts = new int[] { 1, 2, 4, 8, 16, 32, 48, 56, 64, 128, 256, 512, 768, 1024, 1536, 2048,
                3072, 4096, 4608, 5120, 6144, 7168, 8192, 10240, 12288, 14336, 16384, 20480, 24576, 28672, 32768, 40960, 49152 };
            this.Prefix = "btb" + spacing + (varyspacing ? "v" : "") + branchType;
            this.Description = $"Branch Target Buffer, " + branchType + $" branch every {spacing} bytes " + (varyspacing ? " (varied spacing)" : "");
            this.FunctionDefinitionParameters = "uint64_t iterations";
            this.GetFunctionCallParameters = "structIterations";
            this.DivideTimeByCount = true;
            this.spacing = spacing;
            this.branchType = branchType;
            this.varyspacing = varyspacing;
        }

        // Name of the generated function for a given branch count.
        private string GetBranchFuncName(int branchCount) { return Prefix + branchCount; }
        // Label for an individual branch target within a function.
        public string GetLabelName(string funcName, int part) { return funcName + "part" + part; }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                GenerateX86GccAsm(sb);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                GenerateArmAsm(sb);
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                GenerateMipsAsm(sb);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                GenerateRiscvAsm(sb);
            }
        }

        // x86-64: spacing is enforced with .align directives; rax stays 0 so
        // every `jz` below is always taken.
        public void GenerateX86GccAsm(StringBuilder sb)
        {
            string paddingAlign = " .align " + spacing;
            int spacingNops = 0;
            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = GetBranchFuncName(Counts[i]);
                //sb.AppendLine("; Start of function for branch count " + branchCounts[i] + " padding " + paddings[p]);
                sb.AppendLine(funcName + ":\n");
                sb.AppendLine(" xor %rax, %rax");
                if (branchType == BranchType.ZenMix) sb.AppendLine(" .align 64");
                for (int branchIdx = 1; branchIdx < Counts[i]; branchIdx++)
                {
                    string labelName = GetLabelName(funcName, branchIdx);
                    if (branchType == BranchType.Conditional)
                    {
                        sb.AppendLine(" test %rax, %rax");
                        sb.AppendLine(" jz " + labelName); // should always be set
                    }
                    else if (branchType == BranchType.Unconditional)
                    {
                        sb.AppendLine(" jmp " + labelName);
                    }
                    else if (branchType == BranchType.ZenMix)
                    {
                        // alternate unconditional / conditional so pairs share a line
                        if ((branchIdx & 0x1) == 0)
                        {
                            sb.AppendLine(" jmp " + labelName);
                        }
                        else
                        {
                            sb.AppendLine(" test %rax, %rax");
                            sb.AppendLine(" jz " + labelName);
                        }
                    }
                    sb.AppendLine(paddingAlign);
                    if (varyspacing)
                    {
                        // add 0..6 extra nops after alignment, cycling each branch
                        for (int nopIdx = 0; nopIdx < spacingNops; nopIdx++)
                        {
                            sb.AppendLine(" nop");
                        }
                        spacingNops++;
                        if (spacingNops > 6) spacingNops = 0;
                    }
                    sb.AppendLine(labelName + ":");
                }
                sb.AppendLine(" dec %rdi");
                sb.AppendLine(" jne " + funcName);
                sb.AppendLine(" ret\n\n");
                // don't let it get too close to the next branch
                sb.AppendLine(paddingAlign);
            }
        }

        // Builds a nop run that pads a 4-byte branch out to `spacing` bytes,
        // assuming fixed 4-byte instructions (aarch64 and the "mips" target).
        private string Get4BNopAlign()
        {
            string paddingAlign = "";
            if (spacing == 8)
            {
                paddingAlign = " nop";
            }
            else if (spacing == 16)
            {
                paddingAlign = " nop\n nop\n nop";
            }
            else if (spacing == 32)
            {
                paddingAlign = " nop\n nop\n nop\n nop\n nop\n nop\n nop";
            }
            else if (spacing == 64)
            {
                paddingAlign = " nop\n nop\n nop\n nop\n nop\n nop\n nop\n";
                paddingAlign += " nop\n nop\n nop\n nop\n nop\n nop\n nop\n nop";
            }
            else if (spacing != 4)
            {
                // spacing == 4 needs no padding; anything else is unsupported
                Console.WriteLine($"Unsupported padding value {spacing}");
                throw new NotImplementedException("Unsupported padding value");
            }
            return paddingAlign;
        }

        // aarch64: x1 = 1 so `cbnz x1` is always taken. For very large loop
        // bodies, an indirect branch via x2 avoids conditional-branch range limits.
        public void GenerateArmAsm(StringBuilder sb)
        {
            // things are 4 bytes on aarch64
            string paddingAlign = Get4BNopAlign();
            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = GetBranchFuncName(Counts[i]);
                string funcTargetName = GetBranchFuncName(Counts[i]) + "_itarget";
                sb.AppendLine(funcName + ":");
                // x2 = function address, used for the long-range indirect loop branch
                sb.AppendLine($" adrp x2, {funcName}");
                sb.AppendLine($" add x2, x2, :lo12:{funcName}");
                sb.AppendLine(" mov x1, 1");
                sb.AppendLine(".align 16");
                sb.AppendLine(funcTargetName + ":");
                for (int branchIdx = 1; branchIdx < Counts[i]; branchIdx++)
                {
                    string labelName = GetLabelName(funcName, branchIdx);
                    if (branchType == BranchType.Unconditional)
                        sb.AppendLine(" b " + labelName);
                    else if (branchType == BranchType.Conditional)
                        sb.AppendLine(" cbnz x1, " + labelName); // x1 = 1 from earlier, should never be zero
                    else if (branchType == BranchType.ZenMix)
                    {
                        if ((branchIdx & 0x1) == 0) sb.AppendLine(" b " + labelName);
                        else sb.AppendLine(" cbnz x1, " + labelName);
                    }
                    sb.AppendLine(paddingAlign);
                    sb.AppendLine(labelName + ":");
                }
                sb.AppendLine(paddingAlign);
                sb.AppendLine(" sub x0, x0, 1");
                // aarch64 is a mess. try to avoid 'relocation truncated to fit' issues with an indirect branch
                if (spacing * Counts[i] >= (1024 * 1024 - 20))
                {
                    string workaroundTarget = funcName + "_aarch64_indirect_workaround";
                    // jump over indirect branch to return, on zero
                    // this branch should be not taken for all except the last iteration, and should have minimal
                    // impact on results because a predicted NT branch is sort of 'free' on most architectures
                    sb.AppendLine(" cbz x0, " + workaroundTarget);
                    sb.AppendLine(" br x2");
                    sb.AppendLine(workaroundTarget + ":");
                }
                else
                {
                    sb.AppendLine(" cbnz x0, " + funcTargetName);
                }
                sb.AppendLine(" ret\n\n");
                // don't let it get too close to the next branch
                sb.AppendLine(paddingAlign);
            }
        }

        // NOTE(review): despite the mips64 name, mnemonics like addi.d/sub.d and
        // the $r1 return register look like LoongArch — confirm the target ISA.
        // r13 stays 0 so every `beqz $r13` is always taken.
        public void GenerateMipsAsm(StringBuilder sb)
        {
            string paddingAlign = Get4BNopAlign();
            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = GetBranchFuncName(Counts[i]);
                string funcTargetName = GetBranchFuncName(Counts[i]) + "_itarget";
                sb.AppendLine(funcName + ":");
                sb.AppendLine(" xor $r12, $r12, $r12");
                sb.AppendLine(" addi.d $r12, $r12, 1");
                sb.AppendLine(" xor $r13, $r13, $r13");
                // r14 = loop start address for the long-range indirect branch
                sb.AppendLine(" la $r14, " + funcTargetName);
                sb.AppendLine(funcTargetName + ":");
                for (int branchIdx = 1; branchIdx < Counts[i]; branchIdx++)
                {
                    string labelName = GetLabelName(funcName, branchIdx);
                    sb.AppendLine(" beqz $r13, " + labelName);
                    sb.AppendLine(paddingAlign);
                    sb.AppendLine(labelName + ":");
                }
                sb.AppendLine(" sub.d $r4, $r4, $r12"); // decrement iteration count
                int distance = spacing * Counts[i];
                if (distance < 1024)
                {
                    sb.AppendLine(" bnez $r4, " + funcTargetName); // short branch if we're not too far away
                }
                else
                {
                    string workaroundTarget = funcName + "_mips_indirect_workaround";
                    sb.AppendLine(" beqz $r4, " + workaroundTarget); // jump over indirect branch if iteration count is reached
                    sb.AppendLine(" jr $r14"); // jump back to target (start of loop)
                    sb.AppendLine(workaroundTarget + ":");
                }
                sb.AppendLine(" jr $r1");
            }
        }

        // Builds a nop run that pads a branch out to `spacing` bytes.
        // Assumes compressed (RVC) encodings: 2-byte branch, 2-byte nops —
        // TODO confirm the assembler actually emits compressed forms here.
        private string GetRiscvNopAlign()
        {
            // branch takes 16 bits (2 bytes)
            int paddingNeeded = spacing - 2;
            // each NOP is 2 bytes
            StringBuilder nopSb = new StringBuilder();
            for (int i = 0; i < paddingNeeded; i += 2)
            {
                nopSb.AppendLine(" nop");
            }
            return nopSb.ToString();
        }

        // RISC-V: unconditional `j` chain regardless of branchType. For large
        // loop bodies, an indirect jalr via x5 avoids branch range limits.
        public void GenerateRiscvAsm(StringBuilder sb)
        {
            string paddingAlign = GetRiscvNopAlign();
            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = GetBranchFuncName(Counts[i]);
                string funcTargetName = GetBranchFuncName(Counts[i]) + "_itarget";
                sb.AppendLine(funcName + ":");
                sb.AppendLine(" la x5, " + funcTargetName);
                sb.AppendLine(funcTargetName + ":");
                for (int branchIdx = 1; branchIdx < Counts[i]; branchIdx++)
                {
                    string labelName = GetLabelName(funcName, branchIdx);
                    sb.AppendLine(" j " + labelName);
                    sb.AppendLine(paddingAlign);
                    sb.AppendLine(labelName + ":");
                }
                sb.AppendLine(" addi x10, x10, -1"); // decrement iteration count
                int distance = spacing * Counts[i];
                if (distance < 1024)
                {
                    sb.AppendLine(" bne x10, x0, " + funcTargetName); // short branch if we're not too far away
                }
                else
                {
                    string workaroundTarget = funcName + "_riscv_indirect_workaround";
                    sb.AppendLine(" beq x10, x0, " + workaroundTarget); // jump over indirect branch if iteration count is reached
                    sb.AppendLine(" jalr x0, x5"); // jump back to target (start of loop)
                    sb.AppendLine(workaroundTarget + ":");
                }
                sb.AppendLine(" ret");
            }
        }
    }
}
================================================
FILE: AsmGen/tests/CvtSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Scheduler capacity test built around integer-to-float converts.
// NOTE(review): the description string says "F2I" but the generated
// instructions (cvtsi2ss / scvtf) convert int -> float; confirm the label.
public class CvtSchedTest : UarchTest
{
    public CvtSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "cvtsched";
        this.Description = "F2I Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // mips64/riscv generation is not implemented for this test
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // each convert consumes one of the two pointer-chase results
            string[] dependentConverts1 =
            {
                " cvtsi2ss %rdi, %xmm1",
                " cvtsi2ss %rdi, %xmm2",
                " cvtsi2ss %rdi, %xmm3",
                " cvtsi2ss %rdi, %xmm4"
            };
            string[] dependentConverts2 =
            {
                " cvtsi2ss %rsi, %xmm1",
                " cvtsi2ss %rsi, %xmm2",
                " cvtsi2ss %rsi, %xmm3",
                " cvtsi2ss %rsi, %xmm4"
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentConverts1, dependentConverts2);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string[] dependentConverts1 =
            {
                " scvtf s0, w25",
                " scvtf s1, w25",
                " scvtf s2, w25",
                " scvtf s3, w25"
            };
            string[] dependentConverts2 =
            {
                " scvtf s0, w26",
                " scvtf s1, w26",
                " scvtf s2, w26",
                " scvtf s3, w26"
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentConverts1, dependentConverts2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/FAdd256RfTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Measures 256-bit FP/vector register file capacity, optionally with AVX-512
// state set up beforehand or an AVX-512 instruction kept in flight.
public class Fadd256RfTest : UarchTest
{
    // Selects optional AVX-512 state manipulation around the 256-bit RF test.
    public enum TestMode
    {
        none,
        setavx512regs,      // write zmm5-zmm31 before the test loop
        pendingavx512instr  // keep an AVX-512 add in flight during the test
    }

    private bool populateAvx512Regs;
    private bool pendingAvx512Instr;

    public Fadd256RfTest(int low, int high, int step, TestMode mode)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fadd256rf" + mode;
        this.Description = "256-bit FP/vector RF capacity, " + mode;
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        if (mode == TestMode.setavx512regs) populateAvx512Regs = true;
        else if (mode == TestMode.pendingavx512instr) pendingAvx512Instr = true;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return false;
        if (isa == IUarchTest.ISA.mips64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // ymm0 seeds four independent accumulator chains in ymm1-ymm4
            string initInstrs = " vmovups (%r8), %ymm0\n" +
                " vmovups %ymm0, %ymm1\n" +
                " vmovups %ymm0, %ymm2\n" +
                " vmovups %ymm0, %ymm3\n" +
                " vmovups %ymm0, %ymm4\n";
            if (this.populateAvx512Regs)
            {
                for (int i = 5; i < 32; i++)
                {
                    initInstrs += " vmovups 64(%r8), %zmm" + i + "\n";
                }
            }
            string postLoadInstr = string.Empty;
            if (this.pendingAvx512Instr)
            {
                initInstrs += " vmovups 64(%r8), %zmm5\n vmovups 128(%r8), %zmm6\n";
                postLoadInstr = " vaddps %zmm5, %zmm6, %zmm6";
            }
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " vaddps %ymm0, %ymm1, %ymm1";
            unrolledAdds[1] = " vaddps %ymm0, %ymm2, %ymm2";
            unrolledAdds[2] = " vaddps %ymm0, %ymm3, %ymm3";
            // bugfix: accumulate into ymm4 (was %ymm3, leaving ymm4 unused)
            unrolledAdds[3] = " vaddps %ymm0, %ymm4, %ymm4";
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs, postLoadInstrs1: postLoadInstr, postLoadInstrs2: postLoadInstr);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // fill all 32 256-bit vector registers (xvld/xvfadd mnemonics,
            // LoongArch-style, under the mips64 ISA tag), then add on xr1-xr4
            string initInstrs = "";
            for (int regIdx = 0; regIdx < 32; regIdx++)
            {
                initInstrs += " xvld $xr" + regIdx + ", $r6, " + regIdx * 32 + "\n";
            }
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " xvfadd.s $xr1, $xr1, $xr1";
            unrolledAdds[1] = " xvfadd.s $xr2, $xr2, $xr2";
            unrolledAdds[2] = " xvfadd.s $xr3, $xr3, $xr3";
            unrolledAdds[3] = " xvfadd.s $xr4, $xr4, $xr4";
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs: initInstrs);
        }
    }
}
}
================================================
FILE: AsmGen/tests/Fadd128RfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
// Measures 128-bit FP/vector register file capacity, optionally preceding the
// measured instructions with a branch dependent on the pointer-chasing load.
public class Fadd128RfTest : UarchTest
{
    // when set, a load-dependent branch runs before the measured adds
    private bool initialDependentBranch;

    public Fadd128RfTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fadd128rf" + (initialDependentBranch ? "db" : string.Empty);
        this.Description = "128-bit FP/vector RF capacity" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // the dependent-branch variant only exists for aarch64 and riscv
        if (this.initialDependentBranch)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // NOTE(review): this path emits 256-bit ymm ops even though the
            // test is described as 128-bit -- confirm intended
            string initInstrs = " vmovups (%r8), %ymm0\n";
            for (int i = 1; i < 16; i++) initInstrs += $" vmovups %ymm0, %ymm{i}\n";
            // bugfix: List requires a type argument to compile
            List<string> unrolledAddsList = new List<string>();
            for (int i = 1; i < 16; i++) unrolledAddsList.Add($" vaddps %ymm0, %ymm{i}, %ymm{i}");
            string[] unrolledAdds = unrolledAddsList.ToArray();
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            string initInstrs = " ldr q0, [x1]\n" +
                " ldr q1, [x1, #0x10]\n" +
                " ldr q2, [x1, #0x20]\n" +
                " ldr q3, [x1, #0x30]\n" +
                " ldr q4, [x1, #0x40]\n";
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " add v1.4s, v1.4s, v0.4s";
            unrolledAdds[1] = " add v2.4s, v2.4s, v0.4s";
            unrolledAdds[2] = " add v3.4s, v3.4s, v0.4s";
            unrolledAdds[3] = " add v4.4s, v4.4s, v0.4s";
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
            // branch target must exist somewhere after the generated functions
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            string initInstrs = " vsetvli t5, t6, e32\n vlw.v v0, (a1)\n vlw.v v1, (a1)\n vlw.v v2, (a1)\n vlw.v v3, (a1)";
            string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;
            postLoadInstrs += "\n mv t6, a2";
            string[] unrolledInstrs = new string[1];
            unrolledInstrs[0] = " vfadd.vv v0, v0, v0";
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false,
                initInstrs: initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
        }
    }
}
}
================================================
FILE: AsmGen/tests/Fadd128SchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Scheduler capacity test using 128-bit packed adds that all consume a
// register produced by a load tied to the pointer-chasing chain.
public class Fadd128SchedTest : UarchTest
{
    public Fadd128SchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fadd128sched";
        this.Description = "128-bit Vector FP Add Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // xmm0 carries the load dependency; each addps is an independent
            // consumer held in the scheduler until the load completes
            string[] packedAdds =
            {
                " addps %xmm0, %xmm1",
                " addps %xmm0, %xmm2",
                " addps %xmm0, %xmm3",
                " addps %xmm0, %xmm4"
            };
            UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, packedAdds, packedAdds);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // q0 is reloaded with an index derived from each chase register
            string loadForSide1 = " ldr q0, [x2, w25, uxtw#0]";
            string loadForSide2 = " ldr q0, [x2, w26, uxtw#0]";
            string[] packedAdds =
            {
                " add v1.4s, v1.4s, v0.4s",
                " add v2.4s, v2.4s, v0.4s",
                " add v3.4s, v3.4s, v0.4s",
                " add v4.4s, v4.4s, v0.4s"
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, packedAdds, packedAdds, false, null, loadForSide1, loadForSide2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/Fadd256SchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Scheduler capacity test using 256-bit packed adds dependent on a
// pointer-chasing load.
public class Fadd256SchedTest : UarchTest
{
    public Fadd256SchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fadd256sched";
        this.Description = "256-bit FP add scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.mips64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // ymm0 is dependent on ptr chasing load; ymm1-ymm4 are four
            // independent consumers
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " vaddps %ymm0, %ymm1, %ymm1";
            unrolledAdds[1] = " vaddps %ymm0, %ymm2, %ymm2";
            unrolledAdds[2] = " vaddps %ymm0, %ymm3, %ymm3";
            // bugfix: accumulate into ymm4 (was %ymm3, a copy-paste typo)
            unrolledAdds[3] = " vaddps %ymm0, %ymm4, %ymm4";
            UarchTestHelpers.GenerateX86AsmFp256SchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // fill the vector regs, then make xr1 depend on the chase result
            // via a masked indexed load; the adds below all consume xr1
            string initInstrs = "";
            for (int regIdx = 0; regIdx < 32; regIdx++)
            {
                initInstrs += " xvld $xr" + regIdx + ", $r6, " + regIdx * 32 + "\n";
            }
            initInstrs += " move $r16, $r0\n addi.d $r16, $r16, 0xF"; // load mask into r16
            string postLoadInstrs1 = " and $r15, $r12, $r16\n xvldx $xr1, $r6, $r15";
            string postLoadInstrs2 = " and $r15, $r13, $r16\n xvldx $xr1, $r6, $r15";
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " xvfadd.s $xr2, $xr2, $xr1";
            unrolledAdds[1] = " xvfadd.s $xr3, $xr3, $xr1";
            unrolledAdds[2] = " xvfadd.s $xr4, $xr4, $xr1";
            unrolledAdds[3] = " xvfadd.s $xr5, $xr5, $xr1";
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs: initInstrs,
                postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/FaddNsq.cs
================================================
using System.Text;
namespace AsmGen
{
// FP add scheduling-queue test that mixes load-dependent adds with
// load-independent filler adds, sized by a fixed total op count.
public class FaddNsq : UarchTest
{
    private int totalOps;

    public FaddNsq(int low, int high, int step, int totalOps)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "faddnsq" + totalOps;
        this.Description = "FADD, excluding possible NSQ";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.totalOps = totalOps;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // xmm1 carries the load dependency; xmm2 does not
            string setupInstrs = " cvtsi2ss %r12, %xmm2";
            string afterLoadInstrs = " cvtsi2ss %edi, %xmm1";
            string[] dependentAdds =
            {
                " addss %xmm1, %xmm0",
                " addss %xmm1, %xmm3",
                " addss %xmm1, %xmm4",
                " addss %xmm1, %xmm5"
            };
            string[] independentAdds =
            {
                " addss %xmm2, %xmm6",
                " addss %xmm2, %xmm7"
            };
            UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, dependentAdds, independentAdds, false, setupInstrs, afterLoadInstrs);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // s16 is loaded using the pointer-chase result; s15 is not
            string setupInstrs = " ldr s15, [x2]";
            string afterLoadInstrs = " ldr s16, [x2, w25, uxtw #2]";
            string[] dependentAdds =
            {
                " fadd s0, s0, s16",
                " fadd s1, s1, s16",
                " fadd s2, s2, s16",
                " fadd s3, s3, s16"
            };
            string[] independentAdds =
            {
                " fadd s17, s17, s15",
                " fadd s18, s18, s15",
                " fadd s19, s19, s15",
                " fadd s20, s20, s15"
            };
            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, dependentAdds, independentAdds, false, setupInstrs,
                postLoadInstrs: afterLoadInstrs);
        }
    }
}
}
================================================
FILE: AsmGen/tests/FaddSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Scalar FP add scheduler capacity test. On each ISA, the unrolled adds all
// consume a value produced by (or derived from) a pointer-chasing load, so
// they must wait in the scheduler until the load resolves.
public class FaddSchedTest : UarchTest
{
public FaddSchedTest(int low, int high, int step)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "faddsched";
this.Description = "FP Add Scheduler";
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
this.GetFunctionCallParameters = "structIterations, A, fpArr";
this.DivideTimeByCount = false;
}
// implemented for all four target ISAs below
public override bool SupportsIsa(IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64) return true;
if (isa == IUarchTest.ISA.aarch64) return true;
if (isa == IUarchTest.ISA.mips64) return true;
if (isa == IUarchTest.ISA.riscv) return true;
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64)
{
// xmm0 carries the load dependency; xmm1-xmm4 are independent consumers
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " addss %xmm0, %xmm1";
unrolledAdds[1] = " addss %xmm0, %xmm2";
unrolledAdds[2] = " addss %xmm0, %xmm3";
unrolledAdds[3] = " addss %xmm0, %xmm4";
UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
}
else if (isa == IUarchTest.ISA.aarch64)
{
// s16 carries the load dependency (set up by the Arm FP sched helper)
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " fadd s17, s17, s16";
unrolledAdds[1] = " fadd s18, s18, s16";
unrolledAdds[2] = " fadd s19, s19, s16";
unrolledAdds[3] = " fadd s20, s20, s16";
UarchTestHelpers.GenerateArmAsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
}
else if (isa == IUarchTest.ISA.mips64)
{
// f8 is reloaded using an offset masked from each chase register
// ($r12/$r13), so the adds below depend on the chase load
string initInstrs = " fld.s $f8, $r6, 0\n" +
" fld.s $f9, $r6, 4\n" +
" fld.s $f10, $r6, 8\n" +
" fld.s $f11, $r6, 12\n" +
" fld.s $f12, $r6, 16\n";
string postLoadInstrs1 = " andi $r19, $r12, 0xF\n add.d $r19, $r19, $r6\n fld.s $f8, $r19, 0";
string[] dependentAdds = new string[4];
dependentAdds[0] = " fadd.s $f9, $f9, $f8";
dependentAdds[1] = " fadd.s $f10, $f10, $f8";
dependentAdds[2] = " fadd.s $f11, $f11, $f8";
dependentAdds[3] = " fadd.s $f12, $f12, $f8";
string postLoadInstrs2 = " andi $r19, $r13, 0xF\n add.d $r19, $r19, $r6\n fld.s $f8, $r19, 0";
UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
sb, this.Counts, this.Prefix, dependentAdds, dependentAdds, includePtrChasingLoads: false, initInstrs: initInstrs,
postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
}
else if (isa == IUarchTest.ISA.riscv)
{
// same scheme as mips64: f4 is reloaded from an address masked off the
// chase registers (x5/x6), and the four adds all consume f4
string initInstrs = " fld f0, (x12)\n" +
" fld f1, 8(x12)\n" +
" fld f2, 16(x12)\n" +
" fld f3, 24(x12)\n" +
" fld f4, 32(x12)\n";
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " fadd.s f0, f0, f4";
unrolledAdds[1] = " fadd.s f1, f1, f4";
unrolledAdds[2] = " fadd.s f2, f2, f4";
unrolledAdds[3] = " fadd.s f3, f3, f4";
string postLoadInstrs1 = " andi x7, x5, 0xF\n add x7, x7, x12\n fld f4, (x7)";
string postLoadInstrs2 = " andi x7, x6, 0xF\n add x7, x7, x12\n fld f4, (x7)";
UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false,
initInstrs, postLoadInstrs1, postLoadInstrs2);
}
}
}
}
================================================
FILE: AsmGen/tests/FcmpSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// FCMP scheduler capacity test (aarch64 only). Each compare reads s16, which
// the Arm FP sched helper ties to a pointer-chasing load.
public class FcmpSchedTest : UarchTest
{
    public FcmpSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fcmpsched";
        this.Description = "FCMP Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            // four compares against s16, using distinct first operands s17-s20
            // to match the pattern used by FaddSchedTest
            string[] unrolledCmps = new string[4];
            unrolledCmps[0] = " fcmp s17, s16";
            // bugfix: was a duplicated "fcmp s19, s16", leaving s18 unused
            unrolledCmps[1] = " fcmp s18, s16";
            unrolledCmps[2] = " fcmp s19, s16";
            unrolledCmps[3] = " fcmp s20, s16";
            UarchTestHelpers.GenerateArmAsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledCmps, unrolledCmps);
        }
    }
}
}
================================================
FILE: AsmGen/tests/FlagRfTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Measures flags/condition-code register file capacity using instructions
// that only write flags (test/cmp) and produce no GPR result.
public class FlagRfTest : UarchTest
{
    // when true, a branch dependent on the chase load precedes the measured
    // instructions (aarch64 only)
    private bool initialDependentBranch;

    public FlagRfTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "flagrf" + (initialDependentBranch ? "db" : string.Empty);
        this.Description = "Flags Register File" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (this.initialDependentBranch) return isa == IUarchTest.ISA.aarch64;
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            string[] flagWrites = { " test %r15, %r14" };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, flagWrites, flagWrites, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string branchInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            string[] flagWrites = { " cmp x14, x15" };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, flagWrites, flagWrites, includePtrChasingLoads: true, postLoadInstrs1: branchInstrs, postLoadInstrs2: branchInstrs);
            // emit the branch target after the generated functions
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
    }
}
}
================================================
FILE: AsmGen/tests/Fma256SchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Scheduler capacity test using 256-bit fused multiply-adds dependent on a
// pointer-chasing load (mips64/LoongArch-style path only).
public class Fma256SchedTest : UarchTest
{
    public Fma256SchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fma256sched";
        // bugfix: description previously read "256-bit FP add scheduler",
        // copy-pasted from Fadd256SchedTest; this test generates FMAs
        this.Description = "256-bit FMA scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // only the mips64 path is enabled; amd64/aarch64 are deliberately off
        if (isa == IUarchTest.ISA.mips64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // NOTE(review): unreachable while SupportsIsa rejects amd64, and it
            // emits adds rather than FMAs (copied from Fadd256SchedTest)
            // ymm0 is dependent on ptr chasing load
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " vaddps %ymm0, %ymm1, %ymm1";
            unrolledAdds[1] = " vaddps %ymm0, %ymm2, %ymm2";
            unrolledAdds[2] = " vaddps %ymm0, %ymm3, %ymm3";
            // bugfix: accumulate into ymm4 (was %ymm3, a copy-paste typo)
            unrolledAdds[3] = " vaddps %ymm0, %ymm4, %ymm4";
            UarchTestHelpers.GenerateX86AsmFp256SchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // fill the vector regs, then make xr1 depend on the chase result
            // via a masked indexed load; the FMAs below all consume xr1
            string initInstrs = "";
            for (int regIdx = 0; regIdx < 32; regIdx++)
            {
                initInstrs += " xvld $xr" + regIdx + ", $r6, " + regIdx * 32 + "\n";
            }
            initInstrs += " move $r16, $r0\n addi.d $r16, $r16, 0xF"; // load mask into r16
            string postLoadInstrs1 = " and $r15, $r12, $r16\n xvldx $xr1, $r6, $r15";
            string postLoadInstrs2 = " and $r15, $r13, $r16\n xvldx $xr1, $r6, $r15";
            string[] unrolledFmas = new string[4];
            unrolledFmas[0] = " xvfmadd.s $xr2, $xr2, $xr2, $xr1";
            unrolledFmas[1] = " xvfmadd.s $xr3, $xr3, $xr3, $xr1";
            unrolledFmas[2] = " xvfmadd.s $xr4, $xr4, $xr4, $xr1";
            unrolledFmas[3] = " xvfmadd.s $xr5, $xr5, $xr5, $xr1";
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledFmas, unrolledFmas, includePtrChasingLoads: false, initInstrs: initInstrs,
                postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/FmovSched.cs
================================================
using System.Text;
namespace AsmGen
{
// Scheduler capacity test for fmov (vector reg -> GPR) on aarch64. All four
// fmovs consume d16, which is loaded using the pointer-chasing result.
public class FmovSched : UarchTest
{
    public FmovSched(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fmovsched";
        this.Description = "FMOV vec to gpr Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            string postLoadInstrs1 = " ldr d16, [x2, w25, sxtw #0]";
            // bugfix: the second chain must index with w26 (was w25, which tied
            // both halves to the same chase register -- see Fadd128SchedTest
            // and CvtSchedTest for the w25/w26 pattern)
            string postLoadInstrs2 = " ldr d16, [x2, w26, sxtw #0]";
            string[] unrolledInstrs = new string[4];
            unrolledInstrs[0] = " fmov x15, d16";
            unrolledInstrs[1] = " fmov x14, d16";
            unrolledInstrs[2] = " fmov x13, d16";
            unrolledInstrs[3] = " fmov x12, d16";
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null,
                postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/FmulSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Scalar 32-bit FP multiply scheduler capacity test for x86, ARM, and RISC-V.
public class FmulSchedTest : UarchTest
{
    public FmulSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fmulsched";
        this.Description = "FP (32-bit multiply) Scheduler Capacity Test";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
                GenerateX86Asm(sb);
                break;
            case IUarchTest.ISA.aarch64:
                GenerateArmAsm(sb);
                break;
            case IUarchTest.ISA.riscv:
                GenerateRiscvAsm(sb);
                break;
        }
    }

    public void GenerateX86Asm(StringBuilder sb)
    {
        // xmm0 is dependent on ptr chasing load
        string[] dependentMuls =
        {
            " mulss %xmm0, %xmm1",
            " mulss %xmm0, %xmm2",
            " mulss %xmm0, %xmm3",
            " mulss %xmm0, %xmm4"
        };
        UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, dependentMuls, dependentMuls);
    }

    public void GenerateArmAsm(StringBuilder sb)
    {
        // s16 carries the load dependency into four independent multiplies
        string[] dependentMuls =
        {
            " fmul s17, s17, s16",
            " fmul s18, s18, s16",
            " fmul s19, s19, s16",
            " fmul s20, s20, s16"
        };
        UarchTestHelpers.GenerateArmAsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, dependentMuls, dependentMuls);
    }

    public void GenerateRiscvAsm(StringBuilder sb)
    {
        // f4 is reloaded from an address masked off the chase registers
        // (x5/x6), so the multiplies below must wait for that load
        string setupInstrs = " fld f0, (x12)\n" +
            " fld f1, 8(x12)\n" +
            " fld f2, 16(x12)\n" +
            " fld f3, 24(x12)\n" +
            " fld f4, 32(x12)\n";
        string[] dependentMuls =
        {
            " fmul.s f0, f0, f4",
            " fmul.s f1, f1, f4",
            " fmul.s f2, f2, f4",
            " fmul.s f3, f3, f4"
        };
        string reloadForSide1 = " andi x7, x5, 0xF\n add x7, x7, x12\n fld f4, (x7)";
        string reloadForSide2 = " andi x7, x6, 0xF\n add x7, x7, x12\n fld f4, (x7)";
        UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentMuls, dependentMuls, includePtrChasingLoads: false,
            setupInstrs, reloadForSide1, reloadForSide2);
    }
}
}
================================================
FILE: AsmGen/tests/FpRfTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Measures FP register file capacity with scalar adds on independent
// registers, optionally preceding them with a branch dependent on the
// pointer-chasing load (aarch64/riscv only).
public class FpRfTest : UarchTest
{
// when true, a load-dependent branch precedes the measured instructions
private bool initialDependentBranch;
public FpRfTest(int low, int high, int step, bool initialDependentBranch)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "fprf" + (initialDependentBranch ? "db" : string.Empty);
this.Description = "FP Register File" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
this.GetFunctionCallParameters = "structIterations, A, fpArr";
this.DivideTimeByCount = false;
this.initialDependentBranch = initialDependentBranch;
}
public override bool SupportsIsa(IUarchTest.ISA isa)
{
// the dependent-branch variant only exists for aarch64 and riscv
if (this.initialDependentBranch)
{
if (isa == IUarchTest.ISA.aarch64) return true;
if (isa == IUarchTest.ISA.riscv) return true;
return false;
}
if (isa == IUarchTest.ISA.amd64) return true;
if (isa == IUarchTest.ISA.aarch64) return true;
if (isa == IUarchTest.ISA.mips64) return true;
if (isa == IUarchTest.ISA.riscv) return true;
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64)
{
// xmm1 is a shared source; xmm2-xmm5 are independent accumulators
string initInstrs = " movss (%r8), %xmm1\n" +
" movss 4(%r8), %xmm2\n" +
" movss 8(%r8), %xmm3\n" +
" movss 12(%r8), %xmm4\n" +
" movss 16(%r8), %xmm5\n";
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " addss %xmm1, %xmm2";
unrolledAdds[1] = " addss %xmm1, %xmm3";
unrolledAdds[2] = " addss %xmm1, %xmm4";
unrolledAdds[3] = " addss %xmm1, %xmm5";
UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);
}
else if (isa == IUarchTest.ISA.aarch64)
{
string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
// s17 is a shared source; s18-s21 are independent accumulators
string initInstrs = " ldr s17, [x2]\n" +
" ldr s18, [x2, 4]\n" +
" ldr s19, [x2, 8]\n" +
" ldr s20, [x2, 12]\n" +
" ldr s21, [x2, 16]\n";
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " fadd s18, s18, s17";
unrolledAdds[1] = " fadd s19, s19, s17";
unrolledAdds[2] = " fadd s20, s20, s17";
unrolledAdds[3] = " fadd s21, s21, s17";
UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
// branch target emitted after the generated functions
if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
}
else if (isa == IUarchTest.ISA.mips64)
{
// f8 is a shared source; f9-f12 are independent accumulators
string initInstrs = " fld.s $f8, $r6, 0\n" +
" fld.s $f9, $r6, 4\n" +
" fld.s $f10, $r6, 8\n" +
" fld.s $f11, $r6, 12\n" +
" fld.s $f12, $r6, 16\n";
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " fadd.s $f9, $f9, $f8";
unrolledAdds[1] = " fadd.s $f10, $f10, $f8";
unrolledAdds[2] = " fadd.s $f11, $f11, $f8";
unrolledAdds[3] = " fadd.s $f12, $f12, $f8";
UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);
}
else if (isa == IUarchTest.ISA.riscv)
{
string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;
// NOTE(review): the branch target is emitted BEFORE the generated
// functions here, while the aarch64 path above emits it after --
// confirm this ordering is intended
if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
string initInstrs = " fld f0, (x12)\n" +
" fld f1, 8(x12)\n" +
" fld f2, 16(x12)\n" +
" fld f3, 24(x12)\n" +
" fld f4, 32(x12)\n";
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " fadd.s f0, f0, f4";
unrolledAdds[1] = " fadd.s f1, f1, f4";
unrolledAdds[2] = " fadd.s f2, f2, f4";
unrolledAdds[3] = " fadd.s f3, f3, f4";
UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds,
includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
}
}
}
}
================================================
FILE: AsmGen/tests/FpStoreDataNsq.cs
================================================
using System.Text;
namespace AsmGen
{
// Store-data scheduler capacity test for FP 32-bit stores (x86 only). The
// stored value depends on the pointer-chasing load; filler adds do not.
public class FpStoreDataNsqTest : UarchTest
{
    public FpStoreDataNsqTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fpstoredatansq" + high;
        this.Description = "Store FP 32-bit data scheduler capacity, excluding nsq";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;
        // xmm1 carries the load dependency into every store's data operand;
        // xmm2-xmm6 feed the load-independent filler adds
        string initInstrs = " vzeroupper\n vpcmpeqd %xmm2, %xmm2, %xmm2\n vpxor %xmm2, %xmm3, %xmm3\n cvtsi2ss %r11, %xmm3\n movss %xmm3, %xmm4\n movss %xmm3, %xmm5\n movss %xmm3, %xmm6";
        string postLoadInstr = " cvtsi2ss %rdi, %xmm1";
        string[] dependentStores =
        {
            " movss %xmm1, (%r8)",
            " movss %xmm1, (%r8, %r14, 4)",
            " movss %xmm1, (%r8, %r13, 4)",
            " movss %xmm1, (%r8, %r12, 4)"
        };
        string[] independentAdds =
        {
            " addss %xmm2, %xmm3",
            " addss %xmm2, %xmm4",
            " addss %xmm2, %xmm5",
            " addss %xmm2, %xmm6"
        };
        // size the op count off the largest tested structure size
        int totalOps = this.Counts[this.Counts.Length - 1];
        UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, totalOps, this.Counts, this.Prefix, dependentStores, independentAdds, false, initInstrs: initInstrs, postLoadInstrs: postLoadInstr);
    }
}
}
================================================
FILE: AsmGen/tests/IdrfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class IdrfTest : UarchTest
{
    // Probes the register file holding immediates/displacements by mixing
    // immediate-heavy adds, displacement-addressed stores, and filler branches.
    public IdrfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "idrf";
        this.Description = "Immediate/Displacement Register File";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    // Only x86-64 assembly generation is implemented for this test.
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        const string dummyBranchTargetName = "idrftest_badtarget";
        if (isa == IUarchTest.ISA.amd64)
        {
            const int storeCount = 40;
            const int addCount = 130;
            // Fix: List needs its type argument (List<string>) to compile.
            List<string> testInstructions = new List<string>();
            int storeIdx = 0, addIdx = 0;
            for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++)
            {
                if (addIdx < addCount)
                {
                    // add with a unique immediate, rotating across r12-r15
                    string addInstr = " add $" + (i + 1) + ", %r" + (12 + (i % 4));
                    testInstructions.Add(addInstr);
                    addIdx++;
                }
                else if (storeIdx < storeCount)
                {
                    // store with a unique displacement (kept within one page via & 0xFF)
                    string storeInstr = " mov %r11d, " + (((i + 1) & 0xFF) * 4) + "(%r8)";
                    testInstructions.Add(storeInstr);
                    storeIdx++;
                }
                else
                {
                    // never-taken branch (r11 is nonzero); just occupies a branch slot
                    string branchInstr = $" test %r11, %r11\n je {dummyBranchTargetName}";
                    testInstructions.Add(branchInstr);
                }
            }
            string[] unrolledAdds = testInstructions.ToArray();
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);
            // Trap target: reaching it means the "never taken" branch was taken.
            sb.AppendLine($"{dummyBranchTargetName}:\n int3");
        }
    }
}
}
================================================
FILE: AsmGen/tests/IndirectBranchTest.cs
================================================
using System.Text;
using System.IO;
namespace AsmGen
{
public class IndirectBranchTest : IUarchTest
{
    private int[] branchCounts;
    private int[] targetCounts;
    private int globalHistoryAssistBits;
    // When true, emit extra direct branches correlated with the target index so the
    // BPU's global history can "assist" indirect prediction.
    private bool assists;

    public IndirectBranchTest(bool assist)
    {
        Prefix = "indirectbranch";
        Description = "Indirect branch prediction";
        FunctionDefinitionParameters = "uint64_t iterations, uint32_t **arr, uint32_t arrLen, uint64_t **scratch";
        DivideTimeByCount = true;
        branchCounts = new int[] { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 };
        targetCounts = new int[] { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 160, 192, 256, 384, 512 };
        globalHistoryAssistBits = 4;
        this.assists = assist;
    }

    public bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        if (isa == IUarchTest.ISA.mips64) return true;
        return false;
    }

    public void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            GenerateX86GccAsm(sb);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            GenerateArmAsm(sb);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            GenerateMipsAsm(sb);
        }
    }

    // Function name for a (branch count, target count) combination.
    private string GetFunctionName(int branchCount, int targetCount)
    {
        return Prefix + branchCount + "targets" + targetCount;
    }

    // Label name for one target of one indirect branch inside a generated function.
    private string GetTargetLabelName(int branchCount, int targetCount, int branchIndex, int targetIndex)
    {
        return GetFunctionName(branchCount, targetCount) + "branch" + branchIndex + "target" + targetIndex;
    }

    public void GenerateArmAsm(StringBuilder sb)
    {
        for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
        {
            int currentTargetCount = targetCounts[targetCountIdx];
            for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
            {
                int currentBranchCount = branchCounts[branchCountIdx];
                string functionLabel = GetFunctionName(currentBranchCount, currentTargetCount);
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine("\n" + functionLabel + ":");
                sb.AppendLine(" sub sp, sp, #0x60");
                // Fix: x17/x18 were previously saved at #0x40, the same slot as x9/x10,
                // so the x17/x18 save was clobbered and the restore loaded x9/x10's values.
                // Use the otherwise-unused #0x50 slot of the 0x60-byte frame instead.
                sb.AppendLine(" stp x17, x18, [sp, #0x50]");
                sb.AppendLine(" stp x9, x10, [sp, #0x40]");
                sb.AppendLine(" stp x11, x12, [sp, #0x30]");
                sb.AppendLine(" stp x15, x16, [sp, #0x20]");
                sb.AppendLine(" stp x13, x14, [sp, #0x10]");
                sb.AppendLine(" eor x16, x16, x16");
                sb.AppendLine(" eor x15, x15, x15");
                sb.AppendLine(" eor x14, x14, x14");
                sb.AppendLine(" eor x12, x12, x12");
                sb.AppendLine(" eor x11, x11, x11");
                // fill in jump tables for every branch. there has to be a better way to do this
                for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                {
                    // x3 = array of ptrs to jump tables
                    // x14 = index into array of jump tables
                    // x17 = ptr to jump table
                    sb.AppendLine(" ldr x17, [x3, w14, uxtw #3]");
                    for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                    {
                        // assuming 64-bit pointers and 4K page size
                        // use x16 = label index
                        string targetLabelName = GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx);
                        sb.AppendLine($" adrp x10, {targetLabelName}");
                        sb.AppendLine($" add x10, x10, :lo12:{targetLabelName}");
                        sb.AppendLine(" str x10, [x17, w16, uxtw #3]");
                        sb.AppendLine(" add w16, w16, 1");
                    }
                    sb.AppendLine(" eor x16, x16, x16");
                    sb.AppendLine(" add w14, w14, 1");
                }
                // w14 = branch index, w16 = pattern (target) array index
                sb.AppendLine(loopLabel + ":");
                sb.AppendLine(" eor w14, w14, w14");
                // generate branch blocks
                for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                {
                    // get a pointer to the jump table
                    sb.AppendLine(" ldr x9, [x3, w14, uxtw #3]");
                    // look up which target to jump to
                    sb.AppendLine(" ldr x15, [x1, w14, uxtw #3]");
                    sb.AppendLine(" add w14, w14, 1");
                    sb.AppendLine(" ldr w13, [x15, w16, uxtw #2]");
                    // use the target index (w13) to index into the jump table, and branch on it
                    sb.AppendLine(" ldr x17, [x9, w13, uxtw #3]");
                    // global history assist branches
                    // w13 = index into jump table. make that correlate with global history
                    if (this.assists)
                    {
                        sb.AppendLine(" mov x18, 1");
                        sb.AppendLine(" eor w12, w12, w12");
                        for (int eaxBits = 0; eaxBits < globalHistoryAssistBits; eaxBits++)
                        {
                            string targetName = functionLabel + "branch" + branchIdx + "ghist" + eaxBits;
                            sb.AppendLine(" and w12, w13, w18");
                            sb.AppendLine($" cbnz w12, {targetName}");
                            sb.AppendLine(" nop");
                            sb.AppendLine($"{targetName}:");
                            sb.AppendLine(" lsl w18, w18, 1");
                        }
                    }
                    // branch on value of x17
                    sb.AppendLine($" br x17");
                    sb.AppendLine(" nop");
                    // generate targets
                    for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                    {
                        sb.AppendLine(GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx) + ":");
                        sb.AppendLine($" nop");
                    }
                }
                // increment w16, and basically cmov 0 -> w16 if w16 = list length
                sb.AppendLine(" add w16, w16, 1");
                sb.AppendLine(" cmp w16, w2");
                sb.AppendLine(" csel w16, w11, w16, EQ");
                sb.AppendLine(" sub x0, x0, 1");
                sb.AppendLine($" cbnz x0, {loopLabel}");
                sb.AppendLine(" mov x0, x12");
                sb.AppendLine(" ldp x9, x10, [sp, #0x40]");
                sb.AppendLine(" ldp x11, x12, [sp, #0x30]");
                sb.AppendLine(" ldp x15, x16, [sp, #0x20]");
                sb.AppendLine(" ldp x13, x14, [sp, #0x10]");
                // Fix: restore x17/x18 from the slot they were actually saved to.
                sb.AppendLine(" ldp x17, x18, [sp, #0x50]");
                sb.AppendLine(" add sp, sp, #0x60");
                sb.AppendLine(" ret");
            }
        }
    }

    public void GenerateX86GccAsm(StringBuilder sb)
    {
        for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
        {
            int currentTargetCount = targetCounts[targetCountIdx];
            for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
            {
                /* rdi = iteration count
                 * rsi = array of target selection arrays, one for each branch
                 * rdx = length of pattern array
                 * rcx = array of jump tables, one for each branch
                 */
                int currentBranchCount = branchCounts[branchCountIdx];
                string functionLabel = GetFunctionName(currentBranchCount, currentTargetCount);
                sb.AppendLine("\n" + functionLabel + ":");
                sb.AppendLine(" push %rbx");
                sb.AppendLine(" push %r8");
                sb.AppendLine(" push %r9");
                sb.AppendLine(" push %r13");
                sb.AppendLine(" push %r15");
                sb.AppendLine(" push %r14");
                sb.AppendLine(" xor %rbx, %rbx");
                sb.AppendLine(" xor %r8, %r8");
                sb.AppendLine(" xor %r9, %r9");
                // initialize jump table
                for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                {
                    // rcx = array of ptrs to jump tables
                    // r9 = index into array of jump tables
                    // r15 = ptr to jump table
                    // load jump table base address into r15
                    sb.AppendLine(" mov (%rcx,%r9,8), %r15");
                    for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                    {
                        // assuming 64-bit pointers and 4K page size
                        // use rbx = index into
                        string targetLabelName = GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx);
                        sb.AppendLine($" lea {targetLabelName}(%rip), %rax");
                        sb.AppendLine($" mov %rax, (%r15,%rbx,8)");
                        sb.AppendLine(" inc %rbx");
                    }
                    sb.AppendLine(" xor %rbx, %rbx");
                    sb.AppendLine(" inc %r9");
                }
                sb.AppendLine(" xor %r8, %r8");
                sb.AppendLine(" xor %r9, %r9");
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine("\n" + loopLabel + ":");
                sb.AppendLine(" xor %r11, %r11"); // set index into arr of arrs to 0
                for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                {
                    sb.AppendLine(" mov (%rcx,%r11,8), %r15"); // load jump table base pointer into r15
                    sb.AppendLine(" mov (%rsi,%r11,8), %r10"); // load target select array base pointer into r10
                    sb.AppendLine(" inc %r11");
                    sb.AppendLine(" mov (%r10,%rbx,4), %eax"); // get the target for the current iteration into eax
                    sb.AppendLine(" mov (%r15,%rax,8), %r14"); // load address of jump target from jump table
                    if (assists)
                    {
                        sb.AppendLine(" mov %rsi, %r13"); // save rsi; it's used as the ghist bit mask below
                        sb.AppendLine(" mov $1, %rsi");
                        // NOTE(review): this path hardcodes 7 assist bits while the ARM path
                        // uses globalHistoryAssistBits (4) - possibly intentional, left as-is.
                        for (int eaxBits = 0; eaxBits < 7; eaxBits++)
                        {
                            string targetName = functionLabel + "branch" + branchIdx + "ghist" + eaxBits;
                            sb.AppendLine(" test %eax, %esi");
                            sb.AppendLine($" jnz {targetName}");
                            sb.AppendLine(" nop");
                            sb.AppendLine($"{targetName}:");
                            sb.AppendLine(" shl $1, %esi");
                        }
                        // Fix: restore rsi only on the assist path. Previously this ran
                        // unconditionally, clobbering rsi with an uninitialized r13 when
                        // assists was false.
                        sb.AppendLine(" mov %r13, %rsi");
                    }
                    sb.AppendLine(" jmp *%r14"); // and jump to it
                    // generate targets
                    for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                    {
                        sb.AppendLine(GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx) + ":");
                        sb.AppendLine($" nop");
                    }
                }
                // loop around in pattern history test array if necessary
                // avoiding an extra branch to not pollute BPU history
                sb.AppendLine(" inc %rbx");
                sb.AppendLine(" cmp %rbx, %rdx");
                sb.AppendLine(" cmove %r9, %rbx");
                // end of main loop over iteration count
                sb.AppendLine(" dec %rdi");
                sb.AppendLine(" jnz " + loopLabel);
                // function epilogue
                sb.AppendLine(" mov %r8, %rax");
                sb.AppendLine(" pop %r14");
                sb.AppendLine(" pop %r15");
                sb.AppendLine(" pop %r13");
                sb.AppendLine(" pop %r9");
                sb.AppendLine(" pop %r8");
                sb.AppendLine(" pop %rbx");
                sb.AppendLine(" ret");
            }
        }
    }

    // Despite the name, this emits LoongArch-style assembly ($rN, ld.d, alsl.d, maskeqz),
    // matching the repo's mips64 target.
    public void GenerateMipsAsm(StringBuilder sb)
    {
        for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
        {
            int currentTargetCount = targetCounts[targetCountIdx];
            for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
            {
                /* r4 = iteration count
                 * r5 = array of target selection arrays, one for each branch
                 * r6 = length of pattern array
                 * r7 = array of jump tables, one for each branch
                 */
                int currentBranchCount = branchCounts[branchCountIdx];
                string functionLabel = GetFunctionName(currentBranchCount, currentTargetCount);
                sb.AppendLine("\n" + functionLabel + ":");
                // initialize jump tables. r12-r20 are temporary regs.
                sb.AppendLine(" move $r13, $r7"); // use r13 to access array of pointers to jump tables
                for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                {
                    sb.AppendLine(" ld.d $r15, $r13, 0"); // load address of branch's jump table into r15
                    // initialize the jump table. r15 = base addr. rely on C# for bounds :)
                    for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                    {
                        // write label addresses into array
                        string targetLabelName = GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx);
                        sb.AppendLine(" la $r16, " + targetLabelName); // load branch target address into r16
                        sb.AppendLine(" st.d $r16, $r15, 0"); // store branch target address
                        sb.AppendLine(" addi.d $r15, $r15, 8"); // increment array pointer
                    }
                    sb.AppendLine(" addi.d $r13, $r13, 8"); // increment array pointer for array of pointers to jump tables
                }
                // loop through branches for (iterations) times
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine(" move $r14, $r0"); // r14 = branch target index
                sb.AppendLine(" move $r17, $r0");
                sb.AppendLine(" addi.d $r17, $r17, 1"); // use r17 just to store 1
                sb.AppendLine("\n" + loopLabel + ":");
                sb.AppendLine(" move $r12, $r5"); // r12 to hold pointer to target selection array
                sb.AppendLine(" move $r13, $r7"); // r13 to hold pointer to jump target array
                for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                {
                    sb.AppendLine(" ld.d $r16, $r12, 0"); // r16 = base address of target select array
                    sb.AppendLine(" ld.d $r18, $r13, 0"); // r18 = base address of jump target array
                    // target select array[target index]
                    sb.AppendLine(" alsl.d $r15, $r14, $r0, 0x2");
                    sb.AppendLine(" add.d $r15, $r15, $r16");
                    sb.AppendLine(" ld.w $r19, $r15, 0"); // load 32-bit target index
                    sb.AppendLine(" alsl.d $r15, $r19, $r0, 0x3"); // now index into jump table
                    sb.AppendLine(" add.d $r15, $r18, $r15");
                    sb.AppendLine(" ld.d $r20, $r15, 0");
                    // increment pointers for next branch
                    sb.AppendLine(" addi.d $r12, $r12, 8");
                    sb.AppendLine(" addi.d $r13, $r13, 8");
                    sb.AppendLine(" jr $r20");
                    // generate targets
                    for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                    {
                        sb.AppendLine(GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx) + ":");
                        sb.AppendLine($" nop");
                    }
                }
                // loop back. and try to reset branch index without a branch
                sb.AppendLine(" addi.d $r14, $r14, 1"); // if r14 == r6 (pattern array length), set r14 back to 0 somehow
                sb.AppendLine(" sub.d $r12, $r14, $r6"); // 12 = temporary result of comparison
                sb.AppendLine(" maskeqz $r14, $r14, $r12"); // if r12 = 0, set r14 to 0. otherwise use current value
                sb.AppendLine(" sub.d $r4, $r4, $r17");
                sb.AppendLine(" bnez $r4, " + loopLabel);
                sb.AppendLine(" jr $r1");
            }
        }
    }

    // kinda hack this to put in initialization code we need
    public void GenerateExternLines(StringBuilder sb)
    {
        for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
            for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
                sb.AppendLine("extern uint64_t " + GetFunctionName(branchCounts[branchCountIdx], targetCounts[targetCountIdx]) + $"({FunctionDefinitionParameters}) __attribute((sysv_abi));");
        GenerateInitializationCode(sb);
        string gccFunction = File.ReadAllText(Path.Combine(Program.DataFilesDir, "GccIndirectBranchFunction.c"));
        sb.AppendLine(gccFunction);
    }

    // Emits C arrays describing the generated functions plus an initializer that
    // fills a [branch count][target count] function-pointer table.
    public void GenerateInitializationCode(StringBuilder sb)
    {
        sb.AppendLine($"uint32_t maxIndirectBranchCount = {branchCounts.Length};");
        sb.Append($"uint32_t indirectBranchCounts[{branchCounts.Length}] = ");
        sb.Append("{ " + branchCounts[0]);
        for (int i = 1; i < branchCounts.Length; i++) sb.Append(", " + branchCounts[i]);
        sb.AppendLine(" };");
        sb.Append($"uint32_t indirectBranchTargetCounts[{targetCounts.Length}] = ");
        sb.Append("{ " + targetCounts[0]);
        for (int i = 1; i < targetCounts.Length; i++) sb.Append(", " + targetCounts[i]);
        sb.AppendLine(" };");
        sb.AppendLine($"uint64_t (__attribute((sysv_abi)) *indirectBranchTestFuncArr[{branchCounts.Length}][{targetCounts.Length}])({FunctionDefinitionParameters});");
        sb.AppendLine("void initializeIndirectBranchFuncArr() {");
        for (int i = 0; i < branchCounts.Length; i++)
        {
            for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
            {
                sb.AppendLine($"  indirectBranchTestFuncArr[{i}][{targetCountIdx}] = {GetFunctionName(branchCounts[i], targetCounts[targetCountIdx])};");
            }
        }
        sb.AppendLine("}");
    }

    public string Prefix { get; set; }
    public string Description { get; set; }
    public int[] Counts;
    public string FunctionDefinitionParameters { get; set; }
    public string GetFunctionCallParameters { get; set; }
    public bool DivideTimeByCount { get; set; }

    public void GenerateAsmGlobalLines(StringBuilder sb)
    {
        for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
            for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
                sb.AppendLine(".global " + GetFunctionName(branchCounts[branchCountIdx], targetCounts[targetCountIdx]));
    }

    public void GenerateTestBlock(StringBuilder sb, IUarchTest.ISA isa)
    {
        sb.AppendLine("  if (argc > 1 && strncmp(test_name, \"" + Prefix + "\", " + Prefix.Length + ") == 0) {");
        sb.AppendLine("  printf(\"" + Description + ":\\n\");");
        string ibMain = File.ReadAllText(Path.Combine(Program.DataFilesDir, "IndirectBranchTestBlock.c"));
        sb.AppendLine(ibMain);
        sb.AppendLine("  }\n");
    }
}
}
================================================
FILE: AsmGen/tests/IntRfDepStoreTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class IntRfTestDependentStore : UarchTest
{
    // Integer register file capacity test where the measured section is preceded
    // by a store whose data depends on the pointer-chasing load.
    public IntRfTestDependentStore(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "intrfds";
        Description = "Integer Register File, preceded by a dependent store";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        GetFunctionCallParameters = "structIterations, A, fpArr";
        DivideTimeByCount = false;
    }

    // Only aarch64 is reported as supported; the dependent-store instructions are
    // only wired up in the aarch64 path below.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
            {
                // Unreachable via SupportsIsa; kept for completeness.
                string[] filler =
                {
                    " add %r11, %r15",
                    " add %r11, %r14",
                    " add %r11, %r13",
                    " add %r11, %r12"
                };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true);
                break;
            }
            case IUarchTest.ISA.aarch64:
            {
                // Stores after each pointer-chasing load; w15 is written by the adds
                // below, so the store data depends on the measured section.
                string depStore1 = "str w15, [x2, w25, uxtw #2]";
                string depStore2 = "str w15, [x2, w26, uxtw #2]";
                string[] filler =
                {
                    " add x15, x15, x11",
                    " add x14, x14, x11",
                    " add x13, x13, x11",
                    " add x12, x12, x11"
                };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true, postLoadInstrs1: depStore1, postLoadInstrs2: depStore2);
                break;
            }
            case IUarchTest.ISA.mips64:
            {
                // Unreachable via SupportsIsa; kept for completeness.
                string[] filler =
                {
                    " add.d $r15, $r15, $r14",
                    " add.d $r16, $r16, $r14",
                    " add.d $r17, $r17, $r14",
                    " add.d $r18, $r18, $r14"
                };
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true);
                break;
            }
            case IUarchTest.ISA.riscv:
            {
                // Unreachable via SupportsIsa; kept for completeness.
                string[] filler =
                {
                    " add x28, x28, x29",
                    " add x30, x30, x29",
                    " add x31, x31, x29",
                    " add x18, x18, x29"
                };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true);
                break;
            }
        }
    }
}
}
================================================
FILE: AsmGen/tests/IntRfTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class IntRfTest : UarchTest
{
    // When set, each pointer-chasing load feeds a branch, so the measured section
    // sits behind an unresolved (dependent) branch.
    private bool initialDependentBranch;

    public IntRfTest(int low, int high, int step, bool initialDependentBranch)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "intrf" + (initialDependentBranch ? "db" : string.Empty);
        Description = "Integer Register File" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        GetFunctionCallParameters = "structIterations, A";
        DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // The dependent-branch variant is only implemented for aarch64 and riscv.
        if (this.initialDependentBranch)
            return isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.riscv;
        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.mips64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
            {
                string[] filler =
                {
                    " add %r11, %r15",
                    " add %r11, %r14",
                    " add %r11, %r13",
                    " add %r11, %r12"
                };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true);
                break;
            }
            case IUarchTest.ISA.aarch64:
            {
                string branchInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
                string[] filler =
                {
                    " add x15, x15, x11",
                    " add x14, x14, x11",
                    " add x13, x13, x11",
                    " add x12, x12, x11"
                };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true, postLoadInstrs1: branchInstrs, postLoadInstrs2: branchInstrs);
                // Branch target is emitted after the generated functions.
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
                break;
            }
            case IUarchTest.ISA.mips64:
            {
                string[] filler =
                {
                    " add.d $r15, $r15, $r14",
                    " add.d $r16, $r16, $r14",
                    " add.d $r17, $r17, $r14",
                    " add.d $r18, $r18, $r14"
                };
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true);
                break;
            }
            case IUarchTest.ISA.riscv:
            {
                // Branch target is emitted before the generated functions here.
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
                string branchInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;
                string[] filler =
                {
                    " add x28, x28, x29",
                    " add x30, x30, x29",
                    " add x31, x31, x29",
                    " add x18, x18, x29"
                };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true, postLoadInstrs1: branchInstrs, postLoadInstrs2: branchInstrs);
                break;
            }
        }
    }
}
}
================================================
FILE: AsmGen/tests/JsCvtNsq.cs
================================================
using System.Text;
namespace AsmGen
{
public class JsCvtNsq : UarchTest
{
    // Total op count handed to the NSQ harness (dependent + independent filler).
    private int totalOps;

    public JsCvtNsq(int low, int high, int step, int totalOps)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "jscvtnsq";
        Description = "FJCVTZS (FP Javascript Convert to Signed Fixed Point, Rounding toward Zero) Scheduler, excluding possible NSQ";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        GetFunctionCallParameters = "structIterations, A, fpArr";
        DivideTimeByCount = false;
        this.totalOps = totalOps;
    }

    // FJCVTZS only exists on aarch64 (ARMv8.3 JSCVT extension).
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.aarch64) return;

        // d16 depends on the pointer-chasing load; d15 is loaded once up front.
        string loadDependentValue = " ldr d16, [x2, w25, sxtw #0]";
        string initInstrs = " ldr d15, [x2]";
        string[] depInstrs =
        {
            " fjcvtzs w15, d16",
            " fjcvtzs w14, d16",
            " fjcvtzs w13, d16",
            " fjcvtzs w12, d16"
        };
        string[] indepInstrs =
        {
            " fjcvtzs w15, d15",
            " fjcvtzs w14, d15",
            " fjcvtzs w13, d15",
            " fjcvtzs w12, d15"
        };
        UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs,
            postLoadInstrs: loadDependentValue);
    }
}
}
================================================
FILE: AsmGen/tests/JsCvtSched.cs
================================================
using System.Text;
namespace AsmGen
{
public class JsCvtSched : UarchTest
{
    // Scheduler capacity test built from chains of FJCVTZS instructions.
    public JsCvtSched(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "jscvtsched";
        Description = "FJCVTZS (FP Javascript Convert to Signed Fixed Point, Rounding toward Zero) Scheduler";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        GetFunctionCallParameters = "structIterations, A, fpArr";
        DivideTimeByCount = false;
    }

    // FJCVTZS only exists on aarch64 (ARMv8.3 JSCVT extension).
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.aarch64) return;

        // d16 is reloaded after each pointer-chasing load so every fjcvtzs below
        // depends on the load result.
        string loadDependentValue = " ldr d16, [x2, w25, sxtw #0]";
        string[] convertOps =
        {
            " fjcvtzs w15, d16",
            " fjcvtzs w14, d16",
            " fjcvtzs w13, d16",
            " fjcvtzs w12, d16"
        };
        UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, convertOps, convertOps, false, null,
            postLoadInstrs1: loadDependentValue, postLoadInstrs2: loadDependentValue);
    }
}
}
================================================
FILE: AsmGen/tests/JumpNsqTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class JumpNsqTest : UarchTest
{
    // Scheduler capacity for not-taken conditional jumps, with the
    // non-scheduling queue (NSQ) contribution excluded.
    public JumpNsqTest(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "jumpnsq";
        Description = "Scheduler, Not-Taken Jumps, excluding possible nsq";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        GetFunctionCallParameters = "structIterations, A";
        DivideTimeByCount = false;
    }

    // Only x86-64 is implemented; other ISAs were left commented out upstream.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // Compares are never equal in practice, so the jumps stay not-taken;
        // the first compare depends on the pointer-chasing load.
        string[] dependentJumps = { " cmp %rdi, %rsi\n je jumpnsq_reallybadthing" };
        string[] independentJumps = { " cmp %r13, %r14\n je jumpnsq_reallybadthing" };
        UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentJumps, independentJumps);
        // Trap target: reaching it means a "never taken" jump was taken.
        sb.AppendLine("jumpnsq_reallybadthing:\n int3");
    }
}
}
================================================
FILE: AsmGen/tests/JumpSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class JumpSchedTest : UarchTest
{
    // Scheduler capacity test using chains of not-taken conditional jumps.
    public JumpSchedTest(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "jumpsched";
        Description = "Scheduler, Not-Taken Jumps";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        GetFunctionCallParameters = "structIterations, A";
        DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // mips64 is not implemented (was commented out upstream).
        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
            {
                string[] jumps = { " cmp %rdi, %rsi\n je jumpsched_reallybadthing" };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, jumps, jumps, includePtrChasingLoads: true);
                // Trap target: int3 fires if a "never taken" jump is taken.
                sb.AppendLine("jumpsched_reallybadthing:\n int3");
                break;
            }
            case IUarchTest.ISA.aarch64:
            {
                string[] jumps = { " cmp x25, x26\n b.eq jumpsched_reallybadthing" };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, jumps, jumps, includePtrChasingLoads: true);
                // Trap target: permanently-undefined encoding faults if reached.
                sb.AppendLine("jumpsched_reallybadthing:\n .word 0xf7f0a000");
                break;
            }
            case IUarchTest.ISA.riscv:
            {
                // todo
                string[] jumps = { " beq x5, x6, jumpsched_reallybadthing" };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, jumps, jumps, false);
                // Trap target: all-zero word is an illegal instruction on RISC-V.
                sb.AppendLine("jumpsched_reallybadthing:\n .word 0x00000000");
                break;
            }
        }
    }
}
}
================================================
FILE: AsmGen/tests/LdqTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class LdqTest : UarchTest
{
    // When true, each pointer-chasing load is followed by a branch that depends on
    // it, and only the aarch64/riscv variants are generated.
    bool initialDependentBranch;

    public LdqTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.initialDependentBranch = initialDependentBranch;
        string suffix = initialDependentBranch ? "db" : string.Empty;
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "ldq" + suffix;
        this.Description = "Load Queue" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // The dependent-branch variant only has aarch64 and riscv codegen below.
        if (this.initialDependentBranch)
            return isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.riscv;

        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.mips64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // Independent loads fill load queue entries while waiting behind the
            // pointer-chasing load generated by the helper.
            string[] loads =
            {
                " mov (%r8), %r15",
                " mov (%r8), %r14",
                " mov (%r8), %r13",
                " mov (%r8), %r12"
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, loads, loads, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string branch = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            string[] loads =
            {
                " ldr x15, [x2]",
                " ldr x14, [x2]",
                " ldr x13, [x2]",
                " ldr x12, [x2]"
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, loads, loads, includePtrChasingLoads: true, postLoadInstrs1: branch, postLoadInstrs2: branch);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // NOTE(review): "ld.d $rN, $rM, off" reads like LoongArch syntax despite
            // the mips64 tag - confirm against the intended toolchain.
            string[] loads =
            {
                " ld.d $r15, $r6, 0",
                " ld.d $r16, $r6, 8",
                " ld.d $r17, $r6, 16",
                " ld.d $r18, $r6, 24"
            };
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, loads, loads, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            string branch = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;
            string[] loads =
            {
                " ld x28, (x11)",
                " ld x29, 8(x11)",
                " ld x30, 16(x11)",
                " ld x31, 24(x11)"
            };
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, loads, loads,
                includePtrChasingLoads: true, postLoadInstrs1: branch, postLoadInstrs2: branch);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
        }
    }
}
}
================================================
FILE: AsmGen/tests/LeaSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class LeaSchedTest : UarchTest
{
    public LeaSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "leasched";
        this.Description = "Scheduler, lea with base + index + offset";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    // lea is x86-only, so no other ISA gets codegen.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // Four chains, each lea feeding its own destination register.
        string[] leas =
        {
            " lea 128(%r15, %rdi), %r15",
            " lea 128(%r14, %rdi), %r14",
            " lea 128(%r13, %rdi), %r13",
            " lea 128(%r12, %rdi), %r12"
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, leas, leas, includePtrChasingLoads: false);
    }
}
}
================================================
FILE: AsmGen/tests/LoadNsq.cs
================================================
using System.Text;
namespace AsmGen
{
public class LoadNsq : UarchTest
{
    public LoadNsq(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "loadnsq";
        this.Description = "Load Address Scheduler, Excluding any NSQ";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // Loads indexed by %rdi wait on the pointer-chasing result; loads off a
            // plain base register can issue immediately.
            string[] dependent =
            {
                " mov (%r8, %rdi, 4), %r15",
                " mov (%r8, %rdi, 4), %r14",
                " mov (%r8, %rdi, 4), %r13"
            };
            string[] independent =
            {
                " mov (%r8), %r15",
                " mov (%r8), %r14",
                " mov (%r8), %r13"
            };
            UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependent, independent, ptrChasingLoadsInSq: true);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // Same idea: w25 holds the pointer-chasing result.
            string[] dependent =
            {
                " ldr w15, [x2, w25, uxtw #2]",
                " ldr w14, [x2, w25, uxtw #2]",
                " ldr w13, [x2, w25, uxtw #2]"
            };
            string[] independent =
            {
                " ldr w12, [x2]",
                " ldr w11, [x2]",
                " ldr w10, [x2]"
            };
            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependent, independent);
        }
    }
}
}
================================================
FILE: AsmGen/tests/LoadSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class LoadSchedTest : UarchTest
{
    public LoadSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "loadsched";
        this.Description = "Load Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.mips64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // Filler loads indexed by the two pointer-chasing results (%rdi, %rsi)
            // so every load has to wait in the scheduler.
            string[] rdiLoads =
            {
                " mov (%r8, %rdi, 4), %r15",
                " mov (%r8, %rdi, 4), %r14",
                " mov (%r8, %rdi, 4), %r13",
                " mov (%r8, %rdi, 4), %r12"
            };
            string[] rsiLoads =
            {
                " mov (%r8, %rsi, 4), %r15",
                " mov (%r8, %rsi, 4), %r14",
                " mov (%r8, %rsi, 4), %r13",
                " mov (%r8, %rsi, 4), %r12"
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, rdiLoads, rsiLoads, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // w25/w26 hold the pointer-chasing results.
            string[] w25Loads =
            {
                " ldr w15, [x2, w25, uxtw #2]",
                " ldr w14, [x2, w25, uxtw #2]",
                " ldr w13, [x2, w25, uxtw #2]",
                " ldr w12, [x2, w25, uxtw #2]"
            };
            string[] w26Loads =
            {
                " ldr w15, [x2, w26, uxtw #2]",
                " ldr w14, [x2, w26, uxtw #2]",
                " ldr w13, [x2, w26, uxtw #2]",
                " ldr w12, [x2, w26, uxtw #2]"
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, w25Loads, w26Loads, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // Derive an address from each pointer-chasing result ($r12 / $r13).
            // NOTE(review): offsets 0/8/12/16 overlap for 8-byte loads - confirm
            // whether 0/8/16/24 was intended.
            string postLoadInstrs1 = " andi $r19, $r12, 0xF\n add.d $r19, $r19, $r6";
            string postLoadInstrs2 = " andi $r19, $r13, 0xF\n add.d $r19, $r19, $r6";
            string[] loads =
            {
                " ld.d $r15, $r19, 0",
                " ld.d $r16, $r19, 8",
                " ld.d $r17, $r19, 12",
                " ld.d $r18, $r19, 16"
            };
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, loads, loads, includePtrChasingLoads: true, initInstrs: null,
                postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            // x5 and x6 are pointer chasing loads
            string postLoadInstrs1 = " andi x7, x5, 0xF\n add x7, x7, x12";
            string postLoadInstrs2 = " andi x7, x6, 0xF\n add x7, x7, x12";
            string[] loads =
            {
                " ld x28, (x7)",
                " ld x29, 8(x7)",
                " ld x30, 16(x7)",
                " ld x31, 24(x7)"
            };
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, loads, loads, includePtrChasingLoads: true,
                postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MaddSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MaddSchedTest : UarchTest
{
    public MaddSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "maddsched";
        this.Description = "Scheduler, Integer Multiply-Add";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    // madd codegen only exists for aarch64.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.aarch64) return;

        // Four chains, each madd accumulating into its own destination register.
        string[] madds =
        {
            " madd x15, x15, x25, x10",
            " madd x14, x14, x25, x10",
            " madd x13, x13, x25, x10",
            " madd x12, x12, x25, x10"
        };
        UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, madds, madds, includePtrChasingLoads: false);
    }
}
}
================================================
FILE: AsmGen/tests/MaskRfTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MaskRfTest : UarchTest
{
    public MaskRfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "maskrf";
        this.Description = "Mask Registers - AVX-512 only";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    // kaddb is an AVX-512 mask-register op, so x86-64 only.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // Four independent chains on k1-k4, all sourcing k0.
        string[] kadds =
        {
            " kaddb %k0, %k1, %k1",
            " kaddb %k0, %k2, %k2",
            " kaddb %k0, %k3, %k3",
            " kaddb %k0, %k4, %k4"
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, kadds, kadds, includePtrChasingLoads: false);
    }
}
}
================================================
FILE: AsmGen/tests/MixAddJumpSched.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixAddJumpSchedTest : UarchTest
{
    public MixAddJumpSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixaddjumpsched";
        this.Description = "Scheduler, Mixed Adds and Not-Taken Jumps in 3:1 ratio";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        // if (isa == IUarchTest.ISA.mips64) return true;
        // if (isa == IUarchTest.ISA.riscv) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // One never-taken jump followed by three adds per unrolled group.
            // Bug fix: slot 3 previously repeated "add %rsi, %r14" (copy-paste),
            // serializing slots 2 and 3; use %r13 so the three adds stay
            // independent, matching the 3:1 description.
            string[] unrolledJumps = new string[4];
            unrolledJumps[0] = " cmp %rdi, %rsi\n je mixaddjumpsched_reallybadthing";
            unrolledJumps[1] = " add %rsi, %r15";
            unrolledJumps[2] = " add %rsi, %r14";
            unrolledJumps[3] = " add %rsi, %r13";
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);
            // Target that must never be reached; int3 traps if the branch is taken.
            sb.AppendLine("mixaddjumpsched_reallybadthing:\n int3");
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // Same shape as the x86 version; x13 replaces the duplicated x14 in slot 3.
            string[] unrolledJumps = new string[4];
            unrolledJumps[0] = " cmp x25, x26\n b.eq mixaddjumpsched_reallybadthing";
            unrolledJumps[1] = " add x15, x15, x25";
            unrolledJumps[2] = " add x14, x14, x25";
            unrolledJumps[3] = " add x13, x13, x25";
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);
            sb.AppendLine("mixaddjumpsched_reallybadthing:\n .word 0xf7f0a000");
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            // todo - dead code today (SupportsIsa rejects riscv), and these are muls
            // rather than add/jump pairs; finish porting before enabling.
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " mul x30, x30, x5";
            unrolledAdds[1] = " mul x29, x29, x5";
            unrolledAdds[2] = " mul x28, x28, x5";
            unrolledAdds[3] = " mul x31, x31, x5";
            string[] unrolledAdds1 = new string[4];
            unrolledAdds1[0] = " mul x30, x30, x6";
            unrolledAdds1[1] = " mul x31, x31, x6";
            unrolledAdds1[2] = " mul x28, x28, x6";
            unrolledAdds1[3] = " mul x29, x29, x6";
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, false);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixAddvJsCvtNsq.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixAddvJsCvtNsq : UarchTest
{
    public MixAddvJsCvtNsq(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixaddvjscvtnsq";
        this.Description = "ADDV and fjcvtzs Scheduler, Excluding any NSQ";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    // fjcvtzs is an aarch64 instruction; no other ISA is supported.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.aarch64) return;

        // q16/d2 reload from the pointer-chase index so their consumers must wait;
        // q17/d15 are loaded once up front so their consumers can issue immediately.
        string postLoad = " ldr q16, [x2, w25, sxtw #0]\n ldr d2, [x2, w25, sxtw #0]";
        string init = " ldr q17, [x2]\n ldr d15, [x2]";
        string[] dependent =
        {
            " addv h1, v16.4h",
            " fjcvtzs w15, d2",
            " addv h3, v16.4h",
            " fjcvtzs w14, d2"
        };
        string[] independent =
        {
            " addv h4, v17.4h",
            " fjcvtzs w12, d15",
            " addv h5, v17.4h",
            " fjcvtzs w13, d15"
        };
        UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependent, independent, false, initInstrs: init,
            postLoadInstrs: postLoad);
    }
}
}
================================================
FILE: AsmGen/tests/MixAddvJsCvtSched.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixAddvJsCvtSched : UarchTest
{
    public MixAddvJsCvtSched(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixaddvjscvtsched";
        this.Description = "ADDV and fjcvtzs Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    // fjcvtzs is an aarch64 instruction; no other ISA is supported.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.aarch64) return;

        // Reload q16/d2 from each pointer-chasing index (w25 / w26) so all of the
        // addv and fjcvtzs fillers depend on a long-latency load.
        string postLoad1 = " ldr q16, [x2, w25, sxtw #0]\n ldr d2, [x2, w25, sxtw #0]";
        string postLoad2 = " ldr q16, [x2, w26, sxtw #0]\n ldr d2, [x2, w26, sxtw #0]";
        string[] fillers =
        {
            " addv h1, v16.4h",
            " fjcvtzs w15, d2",
            " addv h3, v16.4h",
            " fjcvtzs w14, d2"
        };
        UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillers, fillers, false, null,
            postLoadInstrs1: postLoad1, postLoadInstrs2: postLoad2);
    }
}
}
================================================
FILE: AsmGen/tests/MixBranchStoreTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixBranchStoreTest : UarchTest
{
    // NOTE(review): mixNops is stored but never read anywhere in this class; kept
    // only so the constructor signature stays compatible with existing callers.
    private bool mixNops;
    private bool initialDependentBranch;

    public MixBranchStoreTest(int low, int high, int step, bool mixNops = false, bool initialDependentBranch = false)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixstqbob" + (initialDependentBranch ? "db" : string.Empty);
        // Fixed: removed a stray empty statement (";;") that followed this line.
        this.Description = "Mixed NT branches and stores" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.mixNops = mixNops;
        this.initialDependentBranch = initialDependentBranch;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            GenerateArmAsm(sb);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
    }

    // Emits one function per entry in Counts. Each loop iteration performs a
    // pointer-chasing load (w25, then w26) followed by Counts[i] cmp/b.eq pairs
    // whose targets are the immediately-following label, so the branches fall
    // through either way and pile up behind the load.
    // NOTE(review): despite the class name and description, no store instructions
    // are emitted here - confirm whether stores were intentionally removed.
    public void GenerateArmAsm(StringBuilder sb)
    {
        string dependentBranch = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
        for (int i = 0; i < Counts.Length; i++)
        {
            string funcName = Prefix + Counts[i];
            sb.AppendLine("\n" + funcName + ":");
            // Prologue: save callee-saved registers used below.
            sb.AppendLine(" sub sp, sp, #0x50");
            sb.AppendLine(" stp x14, x15, [sp, #0x10]");
            sb.AppendLine(" stp x12, x13, [sp, #0x20]");
            sb.AppendLine(" stp x10, x11, [sp, #0x30]");
            sb.AppendLine(" stp x25, x26, [sp, #0x40]");
            sb.AppendLine(" mov x15, 1");
            sb.AppendLine(" mov x14, 2");
            sb.AppendLine(" mov x13, 3");
            sb.AppendLine(" mov x12, 4");
            sb.AppendLine(" mov x11, 5");
            sb.AppendLine(" mov x10, 6");
            sb.AppendLine(" mov w25, 0x0");
            sb.AppendLine(" mov w26, 0x40");
            sb.AppendLine("\n" + funcName + "start:");
            sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
            if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
            for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
            {
                // Branch target is the next instruction, so taken or not it falls through.
                string jumpLabel = $"{funcName}_w25_target{fillerIdx}";
                sb.AppendLine($" cmp x15, x10");
                sb.AppendLine($" b.eq {jumpLabel}");
                sb.AppendLine($"{jumpLabel}:");
            }
            sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
            if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
            for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
            {
                string jumpLabel = $"{funcName}_w26_target{fillerIdx}";
                sb.AppendLine($" cmp x15, x10");
                sb.AppendLine($" b.eq {jumpLabel}");
                sb.AppendLine($"{jumpLabel}:");
            }
            sb.AppendLine(" sub x0, x0, 1");
            sb.AppendLine(" cbnz x0, " + funcName + "start");
            // Epilogue: restore saved registers.
            sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
            sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
            sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
            sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
            sb.AppendLine(" add sp, sp, #0x50");
            sb.AppendLine(" ret\n\n");
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixFAdd256and32RfTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixFAdd256and32RfTest : UarchTest
{
    public MixFAdd256and32RfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fadd256and32rf";
        this.Description = "Mixed 32-bit scalar and 256-bit FP RF capacity";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // aarch64 is deliberately excluded; no codegen exists for it.
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.mips64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            string setup = " vmovups (%r8), %ymm0\n" +
                " movss (%r8), %xmm1\n" +
                " vmovups %ymm0, %ymm2\n" +
                " movss (%r8), %xmm3\n" +
                " vmovups %ymm0, %ymm4\n" +
                " movss (%r8), %xmm5\n";
            // Alternate 256-bit vector adds with 32-bit scalar adds.
            string[] fillers =
            {
                " vaddps %ymm0, %ymm1, %ymm1",
                " addss %xmm5, %xmm2",
                " vaddps %ymm0, %ymm3, %ymm3",
                " addss %xmm5, %xmm4"
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillers, fillers, initInstrs: setup);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // Touch every 256-bit vector and scalar FP register once up front.
            string setup = "";
            for (int regIdx = 0; regIdx < 32; regIdx++)
            {
                setup += $" xvld $xr{regIdx}, $r6, {regIdx * 32}\n";
                setup += $" fld.s $f{regIdx}, $r6, {regIdx * 4}\n";
            }
            string[] fillers =
            {
                " xvfadd.s $xr1, $xr1, $xr1",
                " fadd.s $f11, $f11, $f11",
                " xvfadd.s $xr3, $xr3, $xr3",
                " fadd.s $f12, $f12, $f12"
            };
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, fillers, fillers, includePtrChasingLoads: false, initInstrs: setup);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixFpRfDepBranchTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixFpRfDepBranchTest : UarchTest
{
    // A dependent branch is inserted after every `interval` FP adds.
    private int interval;

    public MixFpRfDepBranchTest(int low, int high, int step, int interval)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixfprfdepbranch" + interval;
        this.Description = "FP Register File, with some dependent branches";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *fpArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.interval = interval;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            string initInstrs = " ldr s17, [x2]\n" +
                " ldr s18, [x2, 4]\n" +
                " ldr s19, [x2, 8]\n" +
                " ldr s20, [x2, 12]\n" +
                " ldr s21, [x2, 16]\n";
            // Fixed: List/List() had lost their <string> type arguments and did not compile.
            List<string> unrolledAddsList = new List<string>();
            for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++)
            {
                int regnum = 18 + (i % 4);
                unrolledAddsList.Add($" fadd s{regnum}, s{regnum}, s17");
                // NOTE(review): i starts at 0, so a branch is also emitted before the
                // first add (i % interval == 0 at i == 0); the integer RF variant of
                // this test starts at 1. Confirm which is intended.
                if (i % interval == 0) unrolledAddsList.Add(" cmp x25, x26\n b.eq mixfpjumpsched_badthing" + interval);
            }
            string[] unrolledAdds = unrolledAddsList.ToArray();
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, initInstrs: initInstrs);
            // Branch target that must never be reached.
            sb.AppendLine($"mixfpjumpsched_badthing{interval}:\n .word 0xf7f0a000");
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixFpVecRfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixFpVecRfTest : UarchTest
{
    // When true, a dependent branch precedes the filler instructions.
    private bool initialDependentBranch;

    public MixFpVecRfTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.initialDependentBranch = initialDependentBranch;
        string suffix = initialDependentBranch ? "db" : string.Empty;
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixfpvecrf" + suffix;
        this.Description = "Mixed FP/128-bit FP vec rf" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // Both variants generate riscv code only.
        return isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.riscv) return;

        string initInstrs = " vsetvli t5, t6, e32\n vlw.v v0, (a1)\n fld f0, (a1)";
        string postLoad = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;
        postLoad += "\n mv t6, a2";
        // Alternate a vector FP add with a scalar FP add.
        string[] fillers =
        {
            " vfadd.vv v0, v0, v0",
            " fadd.s f0, f0, f0"
        };
        UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillers, fillers, false,
            initInstrs: initInstrs, postLoadInstrs1: postLoad, postLoadInstrs2: postLoad);
        if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
    }
}
}
================================================
FILE: AsmGen/tests/MixIntRfDepBranchTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixIntRfDepBranchTest : UarchTest
{
    // A dependent branch is inserted after every `interval` integer adds.
    private int interval;

    public MixIntRfDepBranchTest(int low, int high, int step, int interval)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixintrfdepbranch" + interval;
        this.Description = "Integer Register File, with some dependent branches";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
        this.interval = interval;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            // Fixed: List/List() had lost their <string> type arguments and did not compile.
            List<string> unrolledAddsList = new List<string>();
            // i runs 1..max so the first dependent branch lands after `interval` adds.
            for (int i = 1; i < this.Counts[this.Counts.Length - 1] + 1; i++)
            {
                int regnum = 12 + (i % 4);
                unrolledAddsList.Add($" add x{regnum}, x{regnum}, x11");
                if (i % interval == 0) unrolledAddsList.Add(" cmp x25, x26\n b.eq mixintjumpsched_badthing" + interval);
            }
            string[] unrolledAdds = unrolledAddsList.ToArray();
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);
            // Branch target that must never be reached.
            sb.AppendLine($"mixintjumpsched_badthing{interval}:\n .word 0xf7f0a000");
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixIntVec128RfTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixIntVec128RfTest : UarchTest
{
    // When true (aarch64 only), a dependent branch precedes the filler instructions.
    private bool initialDependentBranch;

    public MixIntVec128RfTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.initialDependentBranch = initialDependentBranch;
        string suffix = initialDependentBranch ? "db" : string.Empty;
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixintvec128" + suffix;
        this.Description = "Mixed integer and 128-bit vector register file capacity" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // The dependent-branch variant only has aarch64 codegen.
        if (this.initialDependentBranch) return isa == IUarchTest.ISA.aarch64;
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            string setup = " vmovups (%r8), %ymm0\n" +
                " movss (%r8), %xmm1\n" +
                " vmovups %ymm0, %ymm2\n" +
                " movss (%r8), %xmm3\n" +
                " vmovups %ymm0, %ymm4\n" +
                " movss (%r8), %xmm5\n";
            // Alternate scalar integer adds with scalar SSE FP adds.
            string[] fillers =
            {
                " add %r11, %r15",
                " addss %xmm5, %xmm2",
                " add %r11, %r14",
                " addss %xmm5, %xmm4"
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillers, fillers, initInstrs: setup);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string branch = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            string setup = " ldr q0, [x1]\n" +
                " ldr q1, [x1, #0x10]\n" +
                " ldr q2, [x1, #0x20]\n" +
                " ldr q3, [x1, #0x30]\n" +
                " ldr q4, [x1, #0x40]\n";
            // Alternate 128-bit vector adds with scalar integer adds.
            string[] fillers =
            {
                " add v1.4s, v1.4s, v0.4s",
                " add x15, x15, x11",
                " add v2.4s, v2.4s, v0.4s",
                " add x14, x14, x11"
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, fillers, fillers, false, setup, postLoadInstrs1: branch, postLoadInstrs2: branch);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixIntrfFprfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixIntFpRfTest : UarchTest
{
    // If set, each measured interval is preceded by a branch that depends on a
    // load result, so the structure under test must buffer ops past an
    // unresolved branch.
    private bool initialDependentBranch;

    public MixIntFpRfTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixintfprf" + (initialDependentBranch ? "db" : string.Empty);
        this.Description = "Mixed INT/FP Register File" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // Only the riscv generator is currently enabled; the other checks are
        // kept below for future re-enablement.
        //if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;
        //if (isa == IUarchTest.ISA.amd64) return true;
        //if (isa == IUarchTest.ISA.aarch64) return true;
        //if (isa == IUarchTest.ISA.mips64) return true;
        if (isa == IUarchTest.ISA.riscv) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // todo
            string initInstrs = "  movss (%r8), %xmm1\n" +
                "  movss 4(%r8), %xmm2\n" +
                "  movss 8(%r8), %xmm3\n" +
                "  movss 12(%r8), %xmm4\n" +
                "  movss 16(%r8), %xmm5\n";
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = "  addss %xmm1, %xmm2";
            unrolledAdds[1] = "  addss %xmm1, %xmm3";
            unrolledAdds[2] = "  addss %xmm1, %xmm4";
            unrolledAdds[3] = "  addss %xmm1, %xmm5";
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {// todo
            string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            string initInstrs = "  ldr s17, [x2]\n" +
                "  ldr s18, [x2, 4]\n" +
                "  ldr s19, [x2, 8]\n" +
                "  ldr s20, [x2, 12]\n" +
                "  ldr s21, [x2, 16]\n";
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = "  fadd s18, s18, s17";
            unrolledAdds[1] = "  fadd s19, s19, s17";
            unrolledAdds[2] = "  fadd s20, s20, s17";
            unrolledAdds[3] = "  fadd s21, s21, s17";
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
        else if (isa == IUarchTest.ISA.mips64)
        {// todo
            string initInstrs = "  fld.s $f8, $r6, 0\n" +
                "  fld.s $f9, $r6, 4\n" +
                "  fld.s $f10, $r6, 8\n" +
                "  fld.s $f11, $r6, 12\n" +
                "  fld.s $f12, $r6, 16\n";
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = "  fadd.s $f9, $f9, $f8";
            unrolledAdds[1] = "  fadd.s $f10, $f10, $f8";
            unrolledAdds[2] = "  fadd.s $f11, $f11, $f8";
            unrolledAdds[3] = "  fadd.s $f12, $f12, $f8";
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
            string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;
            string initInstrs = "  fld f0, (x12)\n" +
                "  fld f1, 8(x12)\n" +
                "  fld f2, 16(x12)\n" +
                "  fld f3, 24(x12)\n" +
                "  fld f4, 32(x12)\n";
            // BUGFIX: restore the generic type argument; System.Collections.Generic
            // has no non-generic List, so the bare "List" form does not compile.
            List<string> unrolledAdds = new List<string>();
            /* for C910 */
            for (int i = 0; i < 30; i++) unrolledAdds.Add($"  fadd.s f{i % 4}, f{i % 4}, f4");
            for (int i = 0; i < 200; i++) unrolledAdds.Add($"  add x28, x28, x29");
            /*unrolledAdds.Add("  fadd.s f0, f0, f4");
            unrolledAdds.Add("  add x28, x28, x29");
            unrolledAdds.Add("  fadd.s f1, f1, f4");
            unrolledAdds.Add("  add x30, x30, x29");
            unrolledAdds.Add("  fadd.s f2, f2, f4");
            unrolledAdds.Add("  add x31, x31, x29");
            unrolledAdds.Add("  fadd.s f3, f3, f4");
            unrolledAdds.Add("  add x18, x18, x29");*/
            string[] unrolledAddsArr = unrolledAdds.ToArray();
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAddsArr, unrolledAddsArr,
                includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixJumpStoreDataSched.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixJumpStoreDataSched : UarchTest
{
    public MixJumpStoreDataSched(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixjumpstoredatasched";
        this.Description = "Scheduler, Mixed Jumps and Store Data";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatarr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // x86-64 only; no other ISA generator has been written for this test.
        return isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // Interleave compare+jump pairs with plain stores. The jump target traps
        // with int3, so the jumps are expected never to be taken.
        string[] unrolledJumps =
        {
            "  cmp %rdi, %rsi\n  je mixjumpstoredatasched_reallybadthing",
            "  mov %rdi, (%r8)",
            "  cmp %rdi, %rsi\n  je mixjumpstoredatasched_reallybadthing",
            "  mov %rdi, 64(%r8)"
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);
        // Land on a breakpoint trap if a "never taken" jump fires.
        sb.AppendLine("mixjumpstoredatasched_reallybadthing:\n  int3");
    }
}
}
================================================
FILE: AsmGen/tests/MixJumpStoreSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixJumpStoreSchedTest : UarchTest
{
    public MixJumpStoreSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixjumpstoresched";
        this.Description = "Scheduler, Mixed Jumps and Stores (Address Dependency)";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatarr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // x86-64 only; no other ISA generator has been written for this test.
        return isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // Interleave compare+jump pairs with stores whose address depends on %rdi.
        // The jump target traps with int3, so the jumps are expected never taken.
        string[] unrolledJumps =
        {
            "  cmp %rdi, %rsi\n  je mixstorejumpsched_reallybadthing",
            "  mov %r14, (%r8, %rdi, 2)",
            "  cmp %rdi, %rsi\n  je mixstorejumpsched_reallybadthing",
            "  mov %r14, 64(%r8, %rdi, 2)"
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);
        // Land on a breakpoint trap if a "never taken" jump fires.
        sb.AppendLine("mixstorejumpsched_reallybadthing:\n  int3");
    }
}
}
================================================
FILE: AsmGen/tests/MixJumpThenAddSched.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixJumpThenAddSched : UarchTest
{
    public MixJumpThenAddSched(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixjumpthenaddsched";
        this.Description = "Scheduler, 40 NT jumps + adds";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // aarch64 only; other ISA checks intentionally left disabled.
        // if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        // if (isa == IUarchTest.ISA.mips64) return true;
        // if (isa == IUarchTest.ISA.riscv) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            // First 40 slots are compare-and-branch pairs; remaining slots up to
            // the largest tested count are independent adds. The branch target
            // emits an undefined-looking encoding (.word), presumably to trap if
            // one of the branches is ever taken — confirm against the harness.
            // BUGFIX: restore the generic type argument; System.Collections.Generic
            // has no non-generic List, so the bare "List" form does not compile.
            List<string> unrolledJumps = new List<string>();
            int instrIdx;
            for (instrIdx = 0; instrIdx < 40; instrIdx++) unrolledJumps.Add("  cmp x25, x26\n  b.eq mixaddthenjumpsched_reallybadthing");
            for (; instrIdx < this.Counts[this.Counts.Length - 1]; instrIdx++) unrolledJumps.Add("  add x15, x15, x25");
            string[] instrs = unrolledJumps.ToArray();
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, instrs, instrs, includePtrChasingLoads: true, dsb: true);
            sb.AppendLine("mixaddthenjumpsched_reallybadthing:\n  .word 0xf7f0a000");
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixLdqStqTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixLdqStqTest : UarchTest
{
    // When true, the ARM variant places a dependent branch ahead of the measured
    // memory ops so they stay pending behind an unresolved branch.
    private bool initialDependentBranch;

    public MixLdqStqTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixldqstq" + (initialDependentBranch ? "db" : string.Empty);
        this.Description = "Mixed Load/Store Queue Test (mem ops pending retire)" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, int *arr1";
        this.GetFunctionCallParameters = "structIterations, A, B";
        this.initialDependentBranch = initialDependentBranch;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
                GenerateX86GccAsm(sb);
                break;
            case IUarchTest.ISA.aarch64:
                GenerateArmAsm(sb);
                break;
        }
    }

    // Alternating store/load pairs to two separate arrays.
    // NOTE(review): initialDependentBranch is only honored by the ARM path — confirm intent.
    public void GenerateX86GccAsm(StringBuilder sb)
    {
        string[] instrs =
        {
            "  mov %r15, (%r8)",
            "  mov (%rdx), %r14",
            "  mov %r13, (%r8)",
            "  mov (%rdx), %r12"
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, instrs, instrs, true);
    }

    public void GenerateArmAsm(StringBuilder sb)
    {
        string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
        string[] instrs =
        {
            "  str x15, [x2]",
            "  ldr x14, [x1]",
            "  str x13, [x2]",
            "  ldr x12, [x1]"
        };
        UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
            sb, this.Counts, this.Prefix, instrs, instrs, true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
        if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
    }
}
}
================================================
FILE: AsmGen/tests/MixLoadStoreDivSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixLoadStoreDivSchedTest : UarchTest
{
    public MixLoadStoreDivSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixloadstoredivsched";
        this.Description = "Load/Store Scheduler Capacity Test, using divs to block retirement";
        this.FunctionDefinitionParameters = "uint64_t iterations, int count, int *arr2, int *arr3";
        this.GetFunctionCallParameters = "structIterations, list_size, B, A";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
                GenerateX86Asm(sb);
                break;
            case IUarchTest.ISA.aarch64:
                GenerateArmAsm(sb);
                break;
        }
    }

    // One load and one store per unrolled pair; addresses are indexed by %rdx /
    // w25 / w26, presumably the division result — confirm against the Div helper.
    public void GenerateX86Asm(StringBuilder sb)
    {
        string[] dependentLoads =
        {
            "  mov (%r9, %rdx, 4), %r15",
            "  mov %r14, (%r8, %rdx, 4)"
        };
        UarchTestHelpers.GenerateX86AsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads, false);
    }

    public void GenerateArmAsm(StringBuilder sb)
    {
        string[] dependentLoads =
        {
            "  ldr w15, [x3, w25, uxtw #2]",
            "  str w14, [x2, w25, uxtw #2]"
        };
        string[] dependentLoads1 =
        {
            "  ldr w15, [x3, w26, uxtw #2]",
            "  str w14, [x2, w26, uxtw #2]"
        };
        UarchTestHelpers.GenerateArmAsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads1, false);
    }
}
}
================================================
FILE: AsmGen/tests/MixLoadStoreSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixLoadStoreSched : UarchTest
{
    public MixLoadStoreSched(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixloadstoresched";
        this.Description = "Mixed Load/Store Address Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // Stores and loads indexed by %rdi / %rsi — presumably the pointer
            // chasing load results, making the addresses slow to resolve.
            string[] dependentLoads =
            {
                "  mov %r15, (%r8, %rdi, 4)",
                "  mov (%r8, %rdi, 2), %r14",
                "  mov %r13, (%r8, %rdi, 4)",
                "  mov (%r8, %rdi, 2), %r12"
            };
            string[] dependentLoads1 =
            {
                "  mov %r15, (%r8, %rsi, 4)",
                "  mov (%r8, %rsi, 4), %r14",
                "  mov %r13, (%r8, %rsi, 4)",
                "  mov (%r8, %rsi, 4), %r12"
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads1, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string[] dependentLoads =
            {
                "  str w15, [x2, w25, uxtw #2]",
                "  ldr w14, [x1, w25, uxtw #0]",
                "  str w13, [x2, w25, uxtw #2]",
                "  ldr w12, [x1, w25, uxtw #0]"
            };
            string[] dependentLoads1 =
            {
                "  str w15, [x2, w26, uxtw #2]",
                "  ldr w14, [x1, w26, uxtw #0]",
                "  str w13, [x2, w26, uxtw #2]",
                "  ldr w12, [x1, w26, uxtw #0]"
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads1, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            // x5 and x6 are pointer chasing loads
            string postLoadInstrs1 = "  andi x7, x5, 0xF\n  add x7, x7, x12";
            string postLoadInstrs2 = "  andi x7, x6, 0xF\n  add x7, x7, x12";
            string[] dependentLoads =
            {
                "  sd x28, (a2)",
                "  ld x29, 8(a2)",
                "  sd x30, 16(a2)",
                "  ld x31, 24(a2)"
            };
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads, includePtrChasingLoads: true,
                postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixStoreDivSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixStoreDivSchedTest : UarchTest
{
    public MixStoreDivSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixstoresched";
        this.Description = "Store (Mixed Data/Address) Scheduler Capacity Test";
        this.FunctionDefinitionParameters = "uint64_t iterations, int count, int *arr2";
        this.GetFunctionCallParameters = "structIterations, list_size, B";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
                GenerateX86Asm(sb);
                break;
            case IUarchTest.ISA.aarch64:
                GenerateArmAsm(sb);
                break;
        }
    }

    // Stores alternating between address-dependence and data-dependence on %rdx /
    // w25 / w26 — presumably the division result; confirm against the Div helper.
    public void GenerateX86Asm(StringBuilder sb)
    {
        string[] dependentStores =
        {
            "  mov %rdx, (%r8, %r15, 4)",
            "  mov %r15, (%r8, %rdx, 4)",
            "  mov %rdx, (%r8, %r15, 4)",
            "  mov %r15, (%r8, %rdx, 4)"
        };
        string[] dependentStores1 =
        {
            "  mov %rdx, (%r8, %r11, 4)",
            "  mov %r11, (%r8, %rdx, 4)",
            "  mov %rdx, (%r8, %r11, 4)",
            "  mov %r11, (%r8, %rdx, 4)"
        };
        UarchTestHelpers.GenerateX86AsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, false);
    }

    public void GenerateArmAsm(StringBuilder sb)
    {
        string[] dependentStores =
        {
            "  str w25, [x2, w15, uxtw #2]",
            "  str w15, [x2, w25, uxtw #2]",
            "  str w25, [x2, w15, uxtw #2]",
            "  str w15, [x2, w25, uxtw #2]"
        };
        string[] dependentStores1 =
        {
            "  str w26, [x2, w15, uxtw #2]",
            "  str w15, [x2, w26, uxtw #2]",
            "  str w26, [x2, w15, uxtw #2]",
            "  str w15, [x2, w26, uxtw #2]"
        };
        UarchTestHelpers.GenerateArmAsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, false);
    }
}
}
================================================
FILE: AsmGen/tests/MixVec512Vec256BlockRfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixVec512Vec256BlockRfTest : UarchTest
{
    // number of tiny (256-bit) registers/instructions emitted before the 512-bit block
    private int nTiny;

    public MixVec512Vec256BlockRfTest(int low, int high, int step, int nTiny)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixvec512vec256blockrf" + nTiny;
        this.Description = $"Mixed zmm/ymm regs - AVX-512 only, {nTiny} 256-bit then 512-bit";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        // BUGFIX: the field was never assigned, so GenerateAsm always saw
        // nTiny == 0 and emitted no 256-bit instructions regardless of the
        // constructor argument (only the Prefix/Description reflected it).
        this.nTiny = nTiny;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // use even numbered regs for ymm testing
            string initInstrs = "  vmovups (%r8), %zmm1\n" +
                "  vmovups 64(%r8), %ymm2\n" +
                "  vmovups 128(%r8), %zmm3\n" +
                "  vmovups 192(%r8), %ymm4\n" +
                "  vmovups 256(%r8), %zmm5\n";
            // use all zmm regs
            for (int i = 6; i < 32; i++)
            {
                if ((i & 1) == 0) initInstrs += "vmovups %ymm2, %ymm" + i + "\n";
                else initInstrs += "vmovups %zmm5, %zmm" + i + "\n";
            }
            // BUGFIX: restore the generic type argument; System.Collections.Generic
            // has no non-generic List, so the bare "List" form does not compile.
            List<string> instrsList = new List<string>();
            // First nTiny instructions use 256-bit (ymm) destinations...
            for (int i = 0; i < nTiny; i++)
            {
                int regNum = ((i & 1) == 0) ? i & 0x1F : (i + 1) & 0x1F;
                instrsList.Add($"  vxorps %ymm2, %ymm{regNum}, %ymm{regNum}");
            }
            // ...then the rest, up to the largest tested count, use 512-bit (zmm).
            for (int i = nTiny; i < this.Counts[this.Counts.Length - 1]; i++)
            {
                int regNum = ((i & 1) == 0) ? i : (i + 1);
                regNum = (regNum + 1) & 0x1F;
                instrsList.Add($"  vxorps %zmm1, %zmm{regNum}, %zmm{regNum}");
            }
            string[] unrolledAdds = instrsList.ToArray();
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixVec512Vec256RfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixVec512Vec256RfTest : UarchTest
{
    public MixVec512Vec256RfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixvec512vec256rf";
        this.Description = "Mixed zmm/ymm regs - AVX-512 only, alternating";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // use even numbered regs for ymm testing
            string initInstrs = "  vmovups (%r8), %zmm1\n" +
                "  vmovups 64(%r8), %ymm2\n" +
                "  vmovups 128(%r8), %zmm3\n" +
                "  vmovups 192(%r8), %ymm4\n" +
                "  vmovups 256(%r8), %zmm5\n";
            // use all zmm regs
            for (int i = 6; i < 32; i++)
            {
                if ((i & 1) == 0) initInstrs += "vmovups %ymm2, %ymm" + i + "\n";
                else initInstrs += "vmovups %zmm5, %zmm" + i + "\n";
            }
            // BUGFIX: restore the generic type argument; System.Collections.Generic
            // has no non-generic List, so the bare "List" form does not compile.
            List<string> instrsList = new List<string>();
            // Alternate 256-bit (even regs) and 512-bit (odd regs) adds across 1..31.
            for (int i = 1; i < 32; i++)
            {
                if ((i & 1) == 0) instrsList.Add($"  vaddps %ymm2, %ymm{i}, %ymm{i}");
                else instrsList.Add($"  vaddps %zmm1, %zmm{i}, %zmm{i}");
            }
            string[] unrolledAdds = instrsList.ToArray();
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MmxRfTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MmxRfTest : UarchTest
{
    public MmxRfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mmxrf";
        this.Description = "64-bit MMX RF Capacity Test. x86 only";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, int *arr2";
        this.GetFunctionCallParameters = "structIterations, A, B";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);
    }

    public void GenerateX86GccAsm(StringBuilder sb)
    {
        // fsave stashes the FPU/MMX state into the buffer at %r8 before the test
        // body clobbers mm0-mm4; frstor restores it afterwards.
        string initInstrs =
            "  fsave (%r8)\n" +
            "  movq (%rdx), %mm0\n" +
            "  movq 8(%rdx), %mm1\n" +
            "  movq 16(%rdx), %mm2\n" +
            "  movq 24(%rdx), %mm3\n" +
            "  movq 32(%rdx), %mm4\n";
        string cleanupInstrs = "  frstor (%r8)";
        // Four independent packed-add chains all reading mm0.
        string[] unrolledAdds =
        {
            "  paddw %mm0, %mm1",
            "  paddw %mm0, %mm2",
            "  paddw %mm0, %mm3",
            "  paddw %mm0, %mm4"
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
            sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs, cleanupInstrs: cleanupInstrs);
    }
}
}
================================================
FILE: AsmGen/tests/MulSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MulSchedTest : UarchTest
{
    public MulSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mulsched";
        this.Description = "Scheduler, Integer Multiplies";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // A generator exists for every ISA below.
        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.mips64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
            {
                // Four multiply chains, all sourcing %rdi.
                string[] unrolledMuls =
                {
                    "  imul %rdi, %r15",
                    "  imul %rdi, %r14",
                    "  imul %rdi, %r13",
                    "  imul %rdi, %r12"
                };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledMuls, unrolledMuls, includePtrChasingLoads: false);
                break;
            }
            case IUarchTest.ISA.aarch64:
            {
                string[] unrolledMuls =
                {
                    "  mul x15, x15, x25",
                    "  mul x14, x14, x25",
                    "  mul x13, x13, x25",
                    "  mul x12, x12, x25"
                };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledMuls, unrolledMuls, includePtrChasingLoads: false);
                break;
            }
            case IUarchTest.ISA.mips64:
            {
                // NOTE(review): these mnemonics ($rN, mul.d) look LoongArch-style
                // despite the mips64 enum value — confirm the target assembler.
                string[] unrolledAdds =
                {
                    "  mul.d $r15, $r15, $r12",
                    "  mul.d $r16, $r16, $r12",
                    "  mul.d $r17, $r17, $r12",
                    "  mul.d $r18, $r18, $r12"
                };
                string[] unrolledAdds1 =
                {
                    "  mul.d $r15, $r15, $r13",
                    "  mul.d $r16, $r16, $r13",
                    "  mul.d $r17, $r17, $r13",
                    "  mul.d $r18, $r18, $r13"
                };
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, includePtrChasingLoads: true);
                break;
            }
            case IUarchTest.ISA.riscv:
            {
                string[] unrolledMuls =
                {
                    "  mul x30, x30, x5",
                    "  mul x29, x29, x5",
                    "  mul x28, x28, x5",
                    "  mul x31, x31, x5"
                };
                string[] unrolledMuls1 =
                {
                    "  mul x30, x30, x6",
                    "  mul x31, x31, x6",
                    "  mul x28, x28, x6",
                    "  mul x29, x29, x6"
                };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledMuls, unrolledMuls1, false);
                break;
            }
        }
    }
}
}
================================================
FILE: AsmGen/tests/NopLoopTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class NopLoopTest : UarchTest
{
/// <summary>
/// NOP throughput test: generates loops of varying sizes filled with NOPs,
/// with the loop decrement/branch counted as part of the loop size.
/// </summary>
/// <param name="high">Largest loop size (in instructions) to test;
/// must be greater than 2 because every loop contains a decrement and a branch
/// (counts start at 3).</param>
/// <param name="step">Increment between tested loop sizes.</param>
public NopLoopTest(int high, int step)
{
this.Counts = UarchTestHelpers.GenerateCountArray(3, high, step);
this.Prefix = "noploop";
this.Description = $"NOP throughput for various loop sizes";
this.FunctionDefinitionParameters = "uint64_t iterations";
this.GetFunctionCallParameters = "structIterations";
// Report per-instruction time by dividing total time by the loop's instruction count.
this.DivideTimeByCount = true;
}
public override bool SupportsIsa(IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64) return true;
if (isa == IUarchTest.ISA.aarch64) return true;
if (isa == IUarchTest.ISA.mips64) return false; // explicitly not implemented
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);
if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb);
}
// Emits one x86 loop per tested count: (count - 2) NOPs followed by dec/jnz,
// iterating until the count in %rdi reaches zero.
public void GenerateX86GccAsm(StringBuilder sb)
{
for (int i = 0; i < Counts.Length; i++)
{
string funcName = this.Prefix + this.Counts[i];
sb.AppendLine(funcName + ":");
// count dec, jnz as instructions in the loop
for (int nopIdx = 0; nopIdx < this.Counts[i] - 2; nopIdx++) sb.AppendLine("  nop");
sb.AppendLine("  dec %rdi");
sb.AppendLine("  jnz " + funcName);
sb.AppendLine("  ret");
}
}
// Same structure for aarch64: (count - 2) NOPs followed by sub/cbnz on x0.
public void GenerateArmAsm(StringBuilder sb)
{
for (int i = 0; i < Counts.Length; i++)
{
string funcName = this.Prefix + this.Counts[i];
sb.AppendLine(funcName + ":");
// count dec, jnz as instructions in the loop
for (int nopIdx = 0; nopIdx < this.Counts[i] - 2; nopIdx++) sb.AppendLine("  nop");
sb.AppendLine("  sub x0, x0, 1");
sb.AppendLine("  cbnz x0, " + funcName);
sb.AppendLine("  ret");
}
}
}
}
================================================
FILE: AsmGen/tests/PdepSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class PdepSchedTest : UarchTest
{
public PdepSchedTest(int low, int high, int step)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "pdepsched";
this.Description = "Scheduler, PDEP";
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
this.GetFunctionCallParameters = "structIterations, A";
this.DivideTimeByCount = false;
}
public override bool SupportsIsa(IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64) return true;
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64)
{
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " pdep %rdi, %r15, %r15";
unrolledAdds[1] = " pdep %rdi, %r14, %r14";
unrolledAdds[2] = " pdep %rdi, %r13, %r13";
unrolledAdds[3] = " pdep %rdi, %r12, %r12";
UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false);
}
}
}
}
================================================
FILE: AsmGen/tests/ReturnStackTest.cs
================================================
using System;
using System.Text;
namespace AsmGen
{
public class ReturnStackTest : UarchTest
{
public override bool SupportsIsa(IUarchTest.ISA isa)
{
    // A generator exists for all four ISAs.
    return isa == IUarchTest.ISA.amd64
        || isa == IUarchTest.ISA.aarch64
        || isa == IUarchTest.ISA.mips64
        || isa == IUarchTest.ISA.riscv;
}
// Counts holds the call depths to test (generated from low..high by step).
public ReturnStackTest(int low, int high, int step)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "returnstack";
this.Description = "Return Stack Depth Test";
this.FunctionDefinitionParameters = "uint64_t iterations";
this.GetFunctionCallParameters = "structIterations";
// Divide measured time by count — presumably to report per-call/return cost;
// confirm against how the harness interprets DivideTimeByCount.
this.DivideTimeByCount = true;
}
// Name helpers for generated functions and labels.
private string GetFunctionName(int count, int depth) => $"returnstack{count}_{depth}";
private string GetBranchFuncName(int branchCount) => Prefix + branchCount;
public string GetLabelName(string funcName, int part) => funcName + "part" + part;
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
    // Dispatch to the per-ISA generator.
    switch (isa)
    {
        case IUarchTest.ISA.amd64:
            GenerateX86GccAsm(sb);
            break;
        case IUarchTest.ISA.aarch64:
            GenerateArmAsm(sb);
            break;
        case IUarchTest.ISA.mips64:
            GenerateMipsAsm(sb);
            break;
        case IUarchTest.ISA.riscv:
            GenerateRiscvAsm(sb);
            break;
    }
}
// For each tested call depth, emits a top-level loop function plus a chain of
// distinct callee functions, so every return goes back to a different address
// and a trivial last-return-address predictor cannot cover the whole chain.
public void GenerateX86GccAsm(StringBuilder sb)
{
    foreach (int callDepth in this.Counts)
    {
        string topLevelFunctionLabel = this.Prefix + callDepth;
        sb.AppendLine($"{topLevelFunctionLabel}:");
        sb.AppendLine("  xor %rax, %rax");
        sb.AppendLine($"{topLevelFunctionLabel}_loop:");
        // Loop %rdi times, invoking the head of the call chain each iteration.
        sb.AppendLine("  call " + GetFunctionName(callDepth, 0));
        sb.AppendLine("  dec %rdi");
        sb.AppendLine($"  jne {topLevelFunctionLabel}_loop");
        sb.AppendLine("  ret");
        // generate a batch of functions so we aren't returning to the same address
        // otherwise a simple predictor will suffice
        for (int callIdx = 0; callIdx < callDepth; callIdx++)
        {
            string funcName = GetFunctionName(callDepth, callIdx);
            sb.AppendLine($".global {funcName}");
            sb.AppendLine(".align 128"); // https://github.com/clamchowder/Microbenchmarks/issues/14
            sb.AppendLine($"{funcName}:");
            if (callIdx < callDepth - 1)
            {
                // Non-tail functions do a little work, then call the next link.
                sb.AppendLine("  add %rdi, %rax");
                sb.AppendLine("  call " + GetFunctionName(callDepth, callIdx + 1));
            }
            sb.AppendLine(".align 128");
            sb.AppendLine("  ret");
        }
    }
}
// aarch64 version: one top-level loop function per tested call depth, plus a
// chain of distinct callees so each return targets a different address.
public void GenerateArmAsm(StringBuilder sb)
{
for (int countIdx = 0; countIdx < this.Counts.Length; countIdx++)
{
int callDepth = this.Counts[countIdx];
string topLevelFunctionLabel = this.Prefix + callDepth;
sb.AppendLine($"{topLevelFunctionLabel}:");
// Spill fp/lr (x29/x30) because the loop below makes calls of its own.
sb.AppendLine("  sub sp, sp, #0x20");
sb.AppendLine("  stp x29, x30, [sp, #0x10]");
sb.AppendLine("  eor x3, x3, x3");
sb.AppendLine($"{topLevelFunctionLabel}_loop:");
// Loop x0 times, invoking the head of the call chain each iteration.
sb.AppendLine($"  bl " + GetFunctionName(callDepth, 0));
sb.AppendLine("  sub x0, x0, 1");
sb.AppendLine($"  cbnz x0, {topLevelFunctionLabel}_loop");
sb.AppendLine("  ldp x29, x30, [sp, #0x10]");
sb.AppendLine("  add sp, sp, #0x20");
sb.AppendLine("  ret");
for (int callIdx = 0; callIdx < callDepth; callIdx++)
{
string funcName = GetFunctionName(callDepth, callIdx);
sb.AppendLine($".global {funcName}");
sb.AppendLine($"{funcName}:");
sb.AppendLine($"  add x3, x3, x0");
if (callIdx < callDepth - 1)
{
// Unlike x86 'call', 'bl' writes the return address to x30 (the link
// register) instead of pushing it to the stack, so non-leaf callees
// must save/restore x30 around their own 'bl'.
sb.AppendLine("  sub sp, sp, #0x20");
sb.AppendLine("  stp x29, x30, [sp, #0x10]");
sb.AppendLine("  bl " + GetFunctionName(callDepth, callIdx + 1));
sb.AppendLine("  ldp x29, x30, [sp, #0x10]");
sb.AppendLine("  add sp, sp, #0x20");
}
sb.AppendLine("  ret");
}
}
}
// "mips64" version of the call-chain generator.
// NOTE(review): the mnemonics here ($rN, addi.d, st.d, ld.d, bl, jr $r1) look
// LoongArch-style rather than classic MIPS — confirm the intended assembler.
public void GenerateMipsAsm(StringBuilder sb)
{
for (int countIdx = 0; countIdx < this.Counts.Length; countIdx++)
{
int callDepth = this.Counts[countIdx];
string topLevelFunctionLabel = this.Prefix + callDepth;
sb.AppendLine($"{topLevelFunctionLabel}:");
// top level function runs for specified number of iterations
// r12 = constant 1 (loop decrement), r13 = constant 8 (stack frame size).
sb.AppendLine("  xor $r12, $r12, $r12");
sb.AppendLine("  xor $r13, $r13, $r13");
sb.AppendLine("  addi.d $r12, $r12, 1");
sb.AppendLine("  addi.d $r13, $r13, 8");
sb.AppendLine("  sub.d $sp, $sp, $r13");
sb.AppendLine("  st.d $r1, $sp, 0");
sb.AppendLine($"{topLevelFunctionLabel}_loop:");
// mips stack grows down
sb.AppendLine($"  bl " + GetFunctionName(callDepth, 0));
sb.AppendLine("  sub.d $r4, $r4, $r12");
sb.AppendLine($"  bnez $r4, {topLevelFunctionLabel}_loop");
sb.AppendLine("  ld.d $r1, $sp, 0");
sb.AppendLine("  add.d $sp, $sp, $r13");
sb.AppendLine("  jr $r1");
// generate the dummy functions
for (int callIdx = 0; callIdx < callDepth; callIdx++)
{
string funcName = GetFunctionName(callDepth, callIdx);
sb.AppendLine($".global {funcName}");
sb.AppendLine($"{funcName}:");
if (callIdx < callDepth - 1)
{
// Non-tail links spill the return address register ($r1) and call the next link.
sb.AppendLine("  sub.d $sp, $sp, $r13");
sb.AppendLine("  st.d $r1, $sp, 0"); // save return address
sb.AppendLine("  bl " + GetFunctionName(callDepth, callIdx + 1));
sb.AppendLine("  ld.d $r1, $sp, 0"); // load return address
sb.AppendLine("  add.d $sp, $sp, $r13");
}
sb.AppendLine("  jr $r1");
}
}
}
// Emits RISC-V assembly for each call depth in Counts: a looping top-level
// function per depth plus a chain of callee functions.
public void GenerateRiscvAsm(StringBuilder sb)
{
for (int countIdx = 0; countIdx < this.Counts.Length; countIdx++)
{
int callDepth = this.Counts[countIdx];
string topLevelFunctionLabel = this.Prefix + callDepth;
sb.AppendLine($"{topLevelFunctionLabel}:");
// top level function runs for specified number of iterations
// iteration count in x10
// reserve 16B (keeps sp 16B aligned) and save ra
sb.AppendLine(" addi sp, sp, -16");
sb.AppendLine(" sd ra, (sp)");
sb.AppendLine($"{topLevelFunctionLabel}_loop:");
sb.AppendLine($" jal " + GetFunctionName(callDepth, 0));
sb.AppendLine(" addi x10, x10, -1");
// NOTE(review): 'bge x10, x0' keeps looping while x10 >= 0 after the
// decrement, so the body runs iterations+1 times, unlike the nonzero
// checks used by the other ISAs - confirm the off-by-one is intentional
// (constant overhead, so it likely doesn't affect the measurement).
sb.AppendLine($" bge x10, x0, {topLevelFunctionLabel}_loop");
sb.AppendLine(" ld ra, (sp)");
sb.AppendLine(" addi sp, sp, 16");
sb.AppendLine(" ret");
// generate the dummy functions
for (int callIdx = 0; callIdx < callDepth; callIdx++)
{
string funcName = GetFunctionName(callDepth, callIdx);
sb.AppendLine($".global {funcName}");
sb.AppendLine($"{funcName}:");
if (callIdx < callDepth - 1)
{
sb.AppendLine(" addi sp, sp, -16"); // keep stack pointer 16B aligned even though we only save a 8B reg
sb.AppendLine(" sd ra, (sp)"); // save return address
sb.AppendLine(" jal " + GetFunctionName(callDepth, callIdx + 1));
sb.AppendLine(" ld ra, (sp)"); // load return address
sb.AppendLine(" addi sp, sp, 16");
}
sb.AppendLine(" ret");
}
}
}
}
}
================================================
FILE: AsmGen/tests/RobTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Reorder buffer capacity test: NOP filler is placed between long-latency
    // operations, and the structure-size sweep in Counts locates the capacity.
    public class RobTest : UarchTest
    {
        // "nop" filler used for both halves of the structure test.
        private readonly string[] fillerInstrs;
        // When set, each measurement interval starts with a branch dependent on a load.
        private readonly bool initialDependentBranch;

        public RobTest(int low, int high, int step, bool initialDependentBranch)
        {
            this.initialDependentBranch = initialDependentBranch;
            this.fillerInstrs = new string[] { "nop" };
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "rob" + (initialDependentBranch ? "db" : string.Empty);
            this.Description = "Reorder Buffer Test" + (initialDependentBranch ? " preceded by dependent branch" : string.Empty);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            // The dependent-branch variant is only implemented for aarch64 and riscv.
            if (this.initialDependentBranch)
                return isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.riscv;
            return isa == IUarchTest.ISA.amd64
                || isa == IUarchTest.ISA.aarch64
                || isa == IUarchTest.ISA.mips64
                || isa == IUarchTest.ISA.riscv;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            switch (isa)
            {
                case IUarchTest.ISA.amd64:
                    UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillerInstrs, fillerInstrs, true);
                    break;
                case IUarchTest.ISA.aarch64:
                {
                    string branchInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
                    UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillerInstrs, fillerInstrs, true, postLoadInstrs1: branchInstrs, postLoadInstrs2: branchInstrs);
                    if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
                    break;
                }
                case IUarchTest.ISA.mips64:
                    UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillerInstrs, fillerInstrs, includePtrChasingLoads: true);
                    break;
                case IUarchTest.ISA.riscv:
                {
                    string branchInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;
                    UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillerInstrs, fillerInstrs, includePtrChasingLoads: true, postLoadInstrs1: branchInstrs, postLoadInstrs2: branchInstrs);
                    if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
                    break;
                }
            }
        }
    }
}
================================================
FILE: AsmGen/tests/RorSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Scheduler capacity test using rotate-by-immediate instructions that all
    // depend on a pointer-chasing load result copied into r15.
    public class RorSchedTest : UarchTest
    {
        public RorSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "rorsched";
            this.Description = "Scheduler, Integer Rotate by Immediate (1)";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        // Only x86-64 is implemented for this test.
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64) return;

            // Each half copies its (presumably pointer-chasing) load result into
            // r15, so every rotate below waits on that load.
            string[] rotateInstrs = { " ror $1, %r15" };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                sb,
                this.Counts,
                this.Prefix,
                rotateInstrs,
                rotateInstrs,
                postLoadInstrs1: " mov %rdi, %r15",
                postLoadInstrs2: " mov %rsi, %r15",
                includePtrChasingLoads: false);
        }
    }
}
================================================
FILE: AsmGen/tests/ShlSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Scheduler capacity test using shift-by-immediate instructions that all
    // depend on a pointer-chasing load result copied into r15.
    public class ShlSchedTest : UarchTest
    {
        public ShlSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "shlsched";
            this.Description = "Scheduler, Integer Shift by Immediate (1)";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        // Only x86-64 is implemented for this test.
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64) return;

            // Each half copies its (presumably pointer-chasing) load result into
            // r15, so every shift below waits on that load.
            string[] shiftInstrs = { " shl $1, %r15" };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                sb,
                this.Counts,
                this.Prefix,
                shiftInstrs,
                shiftInstrs,
                postLoadInstrs1: " mov %rdi, %r15",
                postLoadInstrs2: " mov %rsi, %r15",
                includePtrChasingLoads: false);
        }
    }
}
================================================
FILE: AsmGen/tests/StoreDataDivNsqTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store data scheduler test: a chain of divisions blocks retirement while
    // stores whose data depends on the chain pile up in the scheduler.
    public class StoreDataDivNsqTest : UarchTest
    {
        public StoreDataDivNsqTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storedatadivnsq";
            this.Description = "Store Data Scheduler, using DIVs to block retirement";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // idiv puts the remainder in RDX, so these stores' data waits on
                // the blocking division chain.
                string[] blockedStores =
                {
                    " mov %rdx, (%r8, %r15, 4)",
                    " mov %rdx, (%r8, %r15, 4)",
                    " mov %rdx, (%r8, %r15, 4)",
                    " mov %rdx, (%r8, %r15, 4)",
                };
                // Control stores whose data (r14) is always ready.
                string[] readyStores =
                {
                    " mov %r14, (%r8, %r11, 4)",
                    " mov %r14, (%r8, %r11, 4)",
                    " mov %r14, (%r8, %r11, 4)",
                    " mov %r14, (%r8, %r11, 4)",
                };
                UarchTestHelpers.GenerateX86AsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, blockedStores, readyStores);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // w25 presumably carries the division chain result; the dependent
                // store's data register (w25) waits on it, the control one does not.
                string[] blockedStores = { " str w25, [x2, w15, uxtw #2]" };
                string[] readyStores = { " str w15, [x2, w15, uxtw #2]" };
                UarchTestHelpers.GenerateArmAsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, blockedStores, readyStores);
            }
        }
    }
}
================================================
FILE: AsmGen/tests/StoreDataNsqTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store data scheduler test sized to exclude non-scheduling queue entries.
    public class StoreDataNsq : UarchTest
    {
        public StoreDataNsq(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storedatansq";
            this.Description = "Store Data Scheduler, excluding NSQ";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        // Only amd64 is enabled; other ISAs are intentionally left disabled.
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64) return;

            // Stores whose data (rdi) presumably comes from the long-latency
            // load, so they wait in the store data scheduler.
            // (These were misleadingly named "dependentLoads" before - they are stores.)
            string[] dependentStores =
            {
                " mov %rdi, (%r8)",
                " mov %rdi, 8(%r8)",
                " mov %rdi, 16(%r8)",
                " mov %rdi, 24(%r8)",
            };
            // Control stores whose data (r14) is always ready.
            string[] independentStores =
            {
                " mov %r14, (%r8)",
                " mov %r14, 8(%r8)",
                " mov %r14, 16(%r8)",
                " mov %r14, 24(%r8)",
            };
            UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentStores, independentStores);
        }
    }
}
================================================
FILE: AsmGen/tests/StoreDataSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store data scheduler capacity test: store data depends on a slow
    // (pointer-chasing) load, so stores queue up waiting for their data.
    public class StoreDataSchedTest : UarchTest
    {
        public StoreDataSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storedatasched";
            this.Description = "Store Data Scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        // mips64/riscv code paths exist below but are intentionally not enabled.
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // Store data comes from rdi/rsi, presumably the pointer-chasing load results.
                string[] depStoresA =
                {
                    " mov %rdi, (%r8)",
                    " mov %rdi, 8(%r8)",
                    " mov %rdi, 16(%r8)",
                    " mov %rdi, 24(%r8)",
                };
                string[] depStoresB =
                {
                    " mov %rsi, (%r8)",
                    " mov %rsi, 8(%r8)",
                    " mov %rsi, 16(%r8)",
                    " mov %rsi, 24(%r8)",
                };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, depStoresA, depStoresB, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // Store data comes from w25/w26, presumably the pointer-chasing load results.
                string[] depStoresA =
                {
                    " str w25, [x2, 8]",
                    " str w25, [x2, 16]",
                    " str w25, [x2, 24]",
                    " str w25, [x2, 32]",
                };
                string[] depStoresB =
                {
                    " str w26, [x2, 8]",
                    " str w26, [x2, 16]",
                    " str w26, [x2, 24]",
                    " str w26, [x2, 32]",
                };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, depStoresA, depStoresB, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                // Unreachable while SupportsIsa excludes mips64; kept for future use.
                string postLoadInstrs1 = " andi $r19, $r12, 0xF\n add.d $r19, $r19, $r6";
                string postLoadInstrs2 = " andi $r19, $r13, 0xF\n add.d $r19, $r19, $r6";
                string[] depLoads =
                {
                    " ld.d $r15, $r19, 0",
                    " ld.d $r16, $r19, 8",
                    " ld.d $r17, $r19, 12",
                    " ld.d $r18, $r19, 16",
                };
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, depLoads, depLoads, includePtrChasingLoads: true, null,
                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                // Unreachable while SupportsIsa excludes riscv; kept for future use.
                // x5 and x6 are pointer chasing loads
                string postLoadInstrs1 = " andi x7, x5, 0xF\n add x7, x7, x12";
                string postLoadInstrs2 = " andi x7, x6, 0xF\n add x7, x7, x12";
                string[] depLoads =
                {
                    " ld x28, (x7)",
                    " ld x29, 8(x7)",
                    " ld x30, 16(x7)",
                    " ld x31, 24(x7)",
                };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, depLoads, depLoads, includePtrChasingLoads: true,
                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
            }
        }
    }
}
================================================
FILE: AsmGen/tests/StoreDivNsqTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store address scheduler test: a division chain blocks retirement while
    // stores with chain-dependent addresses accumulate; sized to exclude the
    // non-scheduling queue.
    public class StoreDivNsqTest : UarchTest
    {
        public StoreDivNsqTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storedivnsq";
            this.Description = "Store Scheduler, using DIVs to block retirement, excluding NSQ";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // idiv puts the remainder in RDX, so these store addresses wait
                // on the blocking division chain.
                string[] blockedStores =
                {
                    " mov %r15w, (%r8, %rdx, 2)",
                    " mov %r15w, 2(%r8, %rdx, 2)",
                    " mov %r15w, 4(%r8, %rdx, 2)",
                    " mov %r15w, 6(%r8, %rdx, 2)",
                };
                // Control stores whose addresses are always ready.
                string[] readyStores =
                {
                    " mov %r11w, (%r8)",
                    " mov %r11w, 2(%r8)",
                    " mov %r11w, 4(%r8)",
                    " mov %r11w, 6(%r8)",
                };
                UarchTestHelpers.GenerateX86AsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, blockedStores, readyStores);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // w25 presumably carries the division chain result; the dependent
                // store's address indexes by it, the control store's does not.
                string[] blockedStores = { " str w15, [x2, w25, uxtw #2]" };
                string[] readyStores = { " str w15, [x2, w15, uxtw #2]" };
                UarchTestHelpers.GenerateArmAsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, blockedStores, readyStores);
            }
        }
    }
}
================================================
FILE: AsmGen/tests/StoreDivSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store address scheduler capacity test that blocks retirement with a
    // chained division sequence instead of pointer-chasing loads.
    public class StoreDivSchedTest : UarchTest
    {
        public StoreDivSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storedivsched";
            this.Description = "Store Address Scheduler Capacity Test, using divs to block retirement";
            this.FunctionDefinitionParameters = "uint64_t iterations, int count, int *arr2";
            this.GetFunctionCallParameters = "structIterations, list_size, B";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            switch (isa)
            {
                case IUarchTest.ISA.amd64: GenerateX86Asm(sb); break;
                case IUarchTest.ISA.aarch64: GenerateArmAsm(sb); break;
            }
        }

        public void GenerateX86Asm(StringBuilder sb)
        {
            // Store addresses index by rdx, which holds the remainder of the
            // blocking division chain, so they cannot issue early.
            // Instead of pointer chasing loads, a nasty block of chained integer
            // divisions blocks retirement: some older/less capable architectures
            // will not reorder loads ahead of stores with unknown addresses,
            // which breaks the usual technique.
            string[] depStoresA =
            {
                " mov %r15, (%r8, %rdx, 4)",
                " mov %r15, (%r8, %rdx, 4)",
                " mov %r15, (%r8, %rdx, 4)",
                " mov %r15, (%r8, %rdx, 4)",
            };
            string[] depStoresB =
            {
                " mov %r11, (%r8, %rdx, 4)",
                " mov %r11, (%r8, %rdx, 4)",
                " mov %r11, (%r8, %rdx, 4)",
                " mov %r11, (%r8, %rdx, 4)",
            };
            UarchTestHelpers.GenerateX86AsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, depStoresA, depStoresB, false);
        }

        public void GenerateArmAsm(StringBuilder sb)
        {
            // Store addresses index by w25/w26, presumably the division chain results.
            string[] depStoresA =
            {
                " str w15, [x2, w25, uxtw #2]",
                " str w15, [x2, w25, uxtw #2]",
                " str w15, [x2, w25, uxtw #2]",
                " str w15, [x2, w25, uxtw #2]",
            };
            string[] depStoresB =
            {
                " str w15, [x2, w26, uxtw #2]",
                " str w15, [x2, w26, uxtw #2]",
                " str w15, [x2, w26, uxtw #2]",
                " str w15, [x2, w26, uxtw #2]",
            };
            UarchTestHelpers.GenerateArmAsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, depStoresA, depStoresB, false);
        }
    }
}
================================================
FILE: AsmGen/tests/StoreNsq.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store address scheduler test sized to exclude any non-scheduling queue;
    // only implemented for aarch64.
    public class StoreNsq : UarchTest
    {
        public StoreNsq(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storensq";
            this.Description = "Store Address Scheduler, Excluding any NSQ";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64) return;

            // Addresses indexed by w25 (presumably the slow load result) must
            // wait in the scheduler.
            string[] blockedStores =
            {
                " str w15, [x2, w25, uxtw #2]",
                " str w14, [x2, w25, uxtw #2]",
                " str w13, [x2, w25, uxtw #2]",
                " str w12, [x2, w25, uxtw #2]",
            };
            // Control stores indexed by w26.
            string[] readyStores =
            {
                " str w15, [x2, w26, uxtw #2]",
                " str w14, [x2, w26, uxtw #2]",
                " str w13, [x2, w26, uxtw #2]",
                " str w12, [x2, w26, uxtw #2]",
            };
            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, blockedStores, readyStores);
        }
    }
}
================================================
FILE: AsmGen/tests/StoreSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store address scheduler capacity test: store addresses index by a
    // pointer-chasing load result, so the address generation must wait.
    public class StoreSchedTest : UarchTest
    {
        public StoreSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storesched";
            this.Description = "Store Address Scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64
                || isa == IUarchTest.ISA.aarch64
                || isa == IUarchTest.ISA.riscv;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // Addresses index by rdi/rsi, presumably the pointer-chasing load results.
                string[] depStoresA =
                {
                    " mov %r15, (%r8, %rdi, 4)",
                    " mov %r14, (%r8, %rdi, 4)",
                    " mov %r13, (%r8, %rdi, 4)",
                    " mov %r12, (%r8, %rdi, 4)",
                };
                string[] depStoresB =
                {
                    " mov %r15, (%r8, %rsi, 4)",
                    " mov %r14, (%r8, %rsi, 4)",
                    " mov %r13, (%r8, %rsi, 4)",
                    " mov %r12, (%r8, %rsi, 4)",
                };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, depStoresA, depStoresB, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // Addresses index by w25/w26, presumably the pointer-chasing load results.
                string[] depStoresA =
                {
                    " str w15, [x2, w25, uxtw #2]",
                    " str w14, [x2, w25, uxtw #2]",
                    " str w13, [x2, w25, uxtw #2]",
                    " str w12, [x2, w25, uxtw #2]",
                };
                string[] depStoresB =
                {
                    " str w15, [x2, w26, uxtw #2]",
                    " str w14, [x2, w26, uxtw #2]",
                    " str w13, [x2, w26, uxtw #2]",
                    " str w12, [x2, w26, uxtw #2]",
                };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, depStoresA, depStoresB, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                // x5 and x6 are pointer chasing loads
                // NOTE(review): x7 is derived from the chasing loads here, but the
                // stores below address (a2) and never use x7, so their addresses do
                // not actually depend on the loads - confirm whether (x7) was intended
                // (the StoreDataSchedTest riscv path does use x7).
                string postLoadInstrs1 = " andi x7, x5, 0xF\n add x7, x7, x12";
                string postLoadInstrs2 = " andi x7, x6, 0xF\n add x7, x7, x12";
                string[] depStores =
                {
                    " sd x28, (a2)",
                    " sd x29, 8(a2)",
                    " sd x30, 16(a2)",
                    " sd x31, 24(a2)",
                };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, depStores, depStores, includePtrChasingLoads: true,
                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
            }
        }
    }
}
================================================
FILE: AsmGen/tests/Stq128Test.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store queue capacity test using 128-bit vector stores.
    public class Stq128Test : UarchTest
    {
        // When set, each measurement interval starts with a branch dependent on a load.
        private bool initialDependentBranch;

        public Stq128Test(int low, int high, int step, bool initialDependentBranch)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "stq128" + (initialDependentBranch ? "db" : string.Empty);
            // NOTE(review): sibling tests (RobTest, TakenBranchBufferTest) describe this
            // flag as "preceded by dependent branch"; "independent" here may be a
            // copy/paste slip - confirm intended wording before changing the string.
            this.Description = "Store Queue with 128-bit stores" + (initialDependentBranch ? ", preceded by independent branch" : string.Empty);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.initialDependentBranch = initialDependentBranch;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (this.initialDependentBranch)
            {
                // Dependent-branch variant is only implemented for aarch64 and riscv.
                if (isa == IUarchTest.ISA.aarch64) return true;
                if (isa == IUarchTest.ISA.riscv) return true;
                return false;
            }
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // Load xmm1 once up front; the unrolled stores just replay it.
                string initInstrs = " movups (%rdx), %xmm1";
                string[] unrolledStores = new string[4];
                unrolledStores[0] = " movaps %xmm1, (%r8)";
                unrolledStores[1] = " movaps %xmm1, (%r8)";
                unrolledStores[2] = " movaps %xmm1, (%r8)";
                unrolledStores[3] = " movaps %xmm1, (%r8)";
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, initInstrs: initInstrs, includePtrChasingLoads: false);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                string initInstrs = " ldr q0, [x1]";
                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
                string[] unrolledStores = new string[4];
                unrolledStores[0] = " str q0, [x2]";
                unrolledStores[1] = " str q0, [x2]";
                unrolledStores[2] = " str q0, [x2]";
                unrolledStores[3] = " str q0, [x2]";
                // Bug fix: initInstrs was built but never passed to the generator,
                // so q0 was stored without ever being initialized. Pass it, matching
                // the amd64 and riscv branches.
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false,
                    initInstrs: initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                // Set vl for 16 e32 elements and load v0 once; the unrolled stores
                // replay it. NOTE(review): vlw.v/vsw.v are pre-ratification RVV
                // (0.7-era) mnemonics - confirm the target toolchain.
                string initInstrs = " mv t6, x0\n addi t6, t6, 16\n vsetvli t5, t6, e32\n vlw.v v0, (a1)";
                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;
                postLoadInstrs += "\n mv t6, a2";
                string[] unrolledStores = new string[1];
                unrolledStores[0] = " vsw.v v0, (t6)\n addi t6, t6, 64";
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, false,
                    initInstrs: initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
            }
        }
    }
}
================================================
FILE: AsmGen/tests/Stq512Test.cs
================================================
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    // Store queue test with 512-bit (AVX-512) stores, optionally spreading
    // the stores across multiple cache lines.
    public class Stq512Test : UarchTest
    {
        // When set, the unrolled stores walk 64B-spaced offsets instead of
        // hitting the base address only.
        private bool differentLines;

        public Stq512Test(int low, int high, int step, bool differentLines)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "stq512" + (differentLines ? "dl" : string.Empty);
            this.Description = "Store Queue with 512-bit stores - AVX-512 only";
            if (differentLines) this.Description += " with multiple lines";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.differentLines = differentLines;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            return false;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // Initialize zmm0/zmm1 once; the unrolled stores replay them.
                string initInstrs = " vmovaps (%r8), %zmm0\n vmovaps %zmm0, %zmm1";
                string[] unrolledStores;
                if (differentLines)
                {
                    // Bug fix: this was declared as a non-generic 'List', which does
                    // not compile; it must be List<string> (generic argument was
                    // likely lost in an earlier edit).
                    List<string> unrolledStoresList = new List<string>();
                    int maxOffset = 512, currentOffset = 0;
                    for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++)
                    {
                        // Walk offsets in 64B (cache line) steps, wrapping after maxOffset.
                        string storeOffset = currentOffset > 0 ? currentOffset.ToString() : string.Empty;
                        string nextInstr = $" vmovaps %zmm0, {storeOffset}(%r8)";
                        unrolledStoresList.Add(nextInstr);
                        if (currentOffset >= maxOffset)
                        {
                            currentOffset = 0;
                        }
                        else currentOffset += 64;
                        // NOTE(review): this second Add makes each loop iteration emit an
                        // extra store to the base line, doubling the instruction count
                        // relative to Counts - confirm the interleaving is intentional.
                        unrolledStoresList.Add(" vmovaps %zmm0, (%r8)");
                    }
                    unrolledStores = unrolledStoresList.ToArray();
                }
                else
                {
                    unrolledStores = new string[2];
                    unrolledStores[0] = " vmovaps %zmm0, (%r8)";
                    unrolledStores[1] = " vmovaps %zmm1, (%r8)";
                }
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false, initInstrs: initInstrs);
            }
        }
    }
}
================================================
FILE: AsmGen/tests/StqTest.cs
================================================
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    // Store queue capacity test with scalar stores. The 'spaced' variant
    // spreads stores across addresses instead of hammering one location.
    public class StqTest : UarchTest
    {
        // When set, each measurement interval starts with a branch dependent on a load.
        private bool initialDependentBranch;
        // When set, stores are spread across addresses (see per-ISA notes below).
        private bool spaced;

        public StqTest(int low, int high, int step, bool initialDependentBranch, bool spaced)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "stq" + (initialDependentBranch ? "db" : string.Empty);
            // NOTE(review): sibling tests (RobTest, TakenBranchBufferTest) describe this
            // flag as "preceded by dependent branch"; "independent" here may be a
            // copy/paste slip - confirm intended wording before changing the string.
            this.Description = "Store Queue" + (initialDependentBranch ? ", preceded by independent branch" : string.Empty);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.initialDependentBranch = initialDependentBranch;
            this.spaced = spaced;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (this.initialDependentBranch)
            {
                // Dependent-branch variant is only implemented for aarch64 and riscv.
                if (isa == IUarchTest.ISA.aarch64) return true;
                if (isa == IUarchTest.ISA.riscv) return true;
                return false;
            }
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.mips64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] unrolledStores;
                string postLoadInstrs = "";
                if (spaced)
                {
                    // r11 walks from the array base so each store hits a new cache line.
                    postLoadInstrs = "mov %r8, %r11";
                    // Bug fix: this was declared as a non-generic 'List', which does
                    // not compile; it must be List<string>.
                    List<string> storeInstrs = new List<string>();
                    for (int i = 0; i < this.Counts[Counts.Length - 1]; i++)
                    {
                        // Send to different cache lines
                        storeInstrs.Add(" mov %r15, (%r11)\n add $64, %r11");
                    }
                    unrolledStores = storeInstrs.ToArray();
                }
                else
                {
                    unrolledStores = new string[4];
                    unrolledStores[0] = " mov %r15, (%r8)";
                    unrolledStores[1] = " mov %r14, (%r8)";
                    unrolledStores[2] = " mov %r13, (%r8)";
                    unrolledStores[3] = " mov %r12, (%r8)";
                }
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                    sb,
                    this.Counts,
                    this.Prefix,
                    unrolledStores,
                    unrolledStores,
                    postLoadInstrs1: postLoadInstrs,
                    postLoadInstrs2: postLoadInstrs,
                    includePtrChasingLoads: false);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
                string[] unrolledStores = new string[4];
                unrolledStores[0] = " str x15, [x2]";
                unrolledStores[1] = " str x14, [x2]";
                unrolledStores[2] = " str x13, [x2]";
                unrolledStores[3] = " str x12, [x2]";
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                string[] unrolledStores = new string[4];
                unrolledStores[0] = " st.d $r15, $r6, 0";
                unrolledStores[1] = " st.d $r16, $r6, 0";
                unrolledStores[2] = " st.d $r17, $r6, 0";
                unrolledStores[3] = " st.d $r18, $r6, 0";
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;
                string[] unrolledStores;
                if (this.spaced)
                {
                    // Bug fix: this was declared as a non-generic 'List', which does
                    // not compile; it must be List<string>.
                    List<string> stores = new List<string>();
                    // NOTE(review): a fixed 32 stores spaced 16B apart (two per 64B
                    // line), unlike the x86 spaced variant which emits Counts-max
                    // stores 64B apart - confirm the asymmetry is intentional.
                    for (int i = 0; i < 32; i++)
                    {
                        stores.Add($" sd x28, {i * 16}(x12)");
                    }
                    unrolledStores = stores.ToArray();
                }
                else
                {
                    unrolledStores = new string[4];
                    unrolledStores[0] = " sd x28, (x12)";
                    unrolledStores[1] = " sd x29, 8(x12)";
                    unrolledStores[2] = " sd x30, 16(x12)";
                    unrolledStores[3] = " sd x31, 24(x12)";
                }
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, false,
                    postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
            }
        }
    }
}
================================================
FILE: AsmGen/tests/TakenBranchBufferTest.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// Measures effective taken-branch tracking capacity: how many taken branches
/// the core can keep in flight behind a pending (cache-missing) load before
/// stalling. Generates one function per tested count; each fills the window
/// with unconditional taken branches after a pointer-chasing load.
/// </summary>
public class TakenBranchBufferTest : UarchTest
{
    // When true, a branch whose direction depends on the pending load is
    // inserted right after each load so branches cannot resolve/retire until
    // the load completes. Only generated for aarch64 (see SupportsIsa).
    private bool initialDependentBranch;

    /// <summary>
    /// Builds the test over a range of taken-branch counts.
    /// </summary>
    /// <param name="low">Smallest branch count tested.</param>
    /// <param name="high">Largest branch count tested.</param>
    /// <param name="step">Increment between counts.</param>
    /// <param name="initialDependentBranch">Emit a load-dependent branch before the filler branches (aarch64 only).</param>
    public TakenBranchBufferTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "tbb" + (initialDependentBranch ? "db" : string.Empty);
        this.Description = "Taken Branch Buffer Test (taken branches pending retire)" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    /// <summary>
    /// amd64 and aarch64 only; the dependent-branch variant is aarch64 only.
    /// </summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        // if (isa == IUarchTest.ISA.mips64) return true;
        // if (isa == IUarchTest.ISA.riscv) return true;
        return false;
    }

    /// <summary>Dispatches to the per-ISA assembly generator.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);
        else if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb);
    }

    /// <summary>
    /// Emits GNU-syntax x86-64 functions. Each loop iteration does a
    /// pointer-chasing load into edi/esi, then Counts[i] unconditional taken
    /// jumps behind each load. Jumps are .align'd apart so consecutive
    /// branches don't share BTB/fetch slots.
    /// </summary>
    public void GenerateX86GccAsm(StringBuilder sb)
    {
        for (int i = 0; i < Counts.Length; i++)
        {
            string funcName = Prefix + Counts[i];
            sb.AppendLine("\n" + funcName + ":");
            // Save everything we clobber.
            sb.AppendLine(" push %rsi");
            sb.AppendLine(" push %rdi");
            sb.AppendLine(" push %r15");
            sb.AppendLine(" push %r14");
            sb.AppendLine(" push %r13");
            sb.AppendLine(" push %r12");
            sb.AppendLine(" push %r11");
            sb.AppendLine(" push %r8");
            sb.AppendLine(" push %rcx");
            sb.AppendLine(" push %rdx");
            // arguments are in RDI, RSI, RDX, RCX, R8, and R9
            // move them into familiar windows argument regs (rcx, rdx, r8)
            sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
            sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
            sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
            sb.AppendLine(" xor %r15, %r15");
            sb.AppendLine(" mov $0x1, %r14");
            sb.AppendLine(" mov $0x2, %r13");
            sb.AppendLine(" mov $0x3, %r12");
            sb.AppendLine(" mov $0x4, %r11");
            // Two independent pointer-chasing chains start at arr[0] and arr[0x40].
            sb.AppendLine(" xor %rdi, %rdi");
            sb.AppendLine(" mov $0x40, %esi");
            sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
            sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
            sb.AppendLine("\n" + funcName + "start:");
            sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
            for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
            {
                string jumpLabel = $"{funcName}_edi_target{fillerIdx}";
                sb.AppendLine($" jmp {jumpLabel}");
                sb.AppendLine(".align 16");
                // Alternate padding so targets don't all land at identical offsets.
                if (fillerIdx % 2 == 0) sb.AppendLine(" nop");
                sb.AppendLine($"{jumpLabel}:");
            }
            sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
            for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
            {
                string jumpLabel = $"{funcName}_esi_target{fillerIdx}";
                sb.AppendLine($" jmp {jumpLabel}");
                // try to space the jumps out a bit
                sb.AppendLine(".align 16");
                if (fillerIdx % 2 == 0) sb.AppendLine(" nop");
                sb.AppendLine($"{jumpLabel}:");
            }
            sb.AppendLine(" dec %rcx");
            sb.AppendLine(" jne " + funcName + "start");
            // Restore in reverse push order.
            sb.AppendLine(" pop %rdx");
            sb.AppendLine(" pop %rcx");
            sb.AppendLine(" pop %r8");
            sb.AppendLine(" pop %r11");
            sb.AppendLine(" pop %r12");
            sb.AppendLine(" pop %r13");
            sb.AppendLine(" pop %r14");
            sb.AppendLine(" pop %r15");
            sb.AppendLine(" pop %rdi");
            sb.AppendLine(" pop %rsi");
            sb.AppendLine(" ret\n\n");
        }
    }

    /// <summary>
    /// Emits aarch64 functions. x0 = iteration count, x1 = pointer chasing
    /// array. w25/w26 walk two independent chains; Counts[i] taken branches
    /// follow each chained load (optionally preceded by a load-dependent
    /// branch for the "db" variant).
    /// </summary>
    public void GenerateArmAsm(StringBuilder sb)
    {
        string dependentBranch = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
        for (int i = 0; i < Counts.Length; i++)
        {
            string funcName = Prefix + Counts[i];
            sb.AppendLine("\n" + funcName + ":");
            // Save callee-saved / clobbered registers on the stack.
            sb.AppendLine(" sub sp, sp, #0x50");
            sb.AppendLine(" stp x14, x15, [sp, #0x10]");
            sb.AppendLine(" stp x12, x13, [sp, #0x20]");
            sb.AppendLine(" stp x10, x11, [sp, #0x30]");
            sb.AppendLine(" stp x25, x26, [sp, #0x40]");
            sb.AppendLine(" mov x15, 1");
            sb.AppendLine(" mov x14, 2");
            sb.AppendLine(" mov x13, 3");
            sb.AppendLine(" mov x12, 4");
            sb.AppendLine(" mov x11, 5");
            sb.AppendLine(" mov x10, 6");
            // Chain start indices, matching the x86 version (0 and 0x40).
            sb.AppendLine(" mov w25, 0x0");
            sb.AppendLine(" mov w26, 0x40");
            sb.AppendLine("\n" + funcName + "start:");
            sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
            if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
            for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
            {
                string jumpLabel = $"{funcName}_w25_target{fillerIdx}";
                sb.AppendLine($" b {jumpLabel}");
                sb.AppendLine($"{jumpLabel}:");
            }
            sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
            if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
            for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
            {
                string jumpLabel = $"{funcName}_w26_target{fillerIdx}";
                sb.AppendLine($" b {jumpLabel}");
                sb.AppendLine($"{jumpLabel}:");
            }
            sb.AppendLine(" sub x0, x0, 1");
            sb.AppendLine(" cbnz x0, " + funcName + "start");
            sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
            sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
            sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
            sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
            sb.AppendLine(" add sp, sp, #0x50");
            sb.AppendLine(" ret\n\n");
        }
        // One shared target for all dependent branches generated above.
        if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
    }
}
}
================================================
FILE: AsmGen/tests/TakenJumpSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// Measures scheduler capacity for taken conditional jumps: each generated
/// function puts Counts[i] compare+branch pairs (whose condition depends on a
/// pending load) between two pointer-chasing loads and times the loop.
/// </summary>
public class TakenJumpSchedTest : UarchTest
{
    /// <summary>
    /// Builds the test over a range of in-flight jump counts.
    /// </summary>
    /// <param name="low">Smallest count tested.</param>
    /// <param name="high">Largest count tested.</param>
    /// <param name="step">Increment between counts.</param>
    public TakenJumpSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "takenjumpsched";
        this.Description = "Scheduler, Taken Jumps";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    /// <summary>amd64 and aarch64 only (riscv path below is unfinished).</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        // if (isa == IUarchTest.ISA.mips64) return true;
        // if (isa == IUarchTest.ISA.riscv) return true;
        return false;
    }

    /// <summary>Emits the per-ISA test functions into <paramref name="sb"/>.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            for (int i = 0; i < this.Counts.Length; i++)
            {
                string funcName = this.Prefix + this.Counts[i];
                sb.AppendLine("\n" + funcName + ":");
                sb.AppendLine(" push %rsi");
                sb.AppendLine(" push %rdi");
                sb.AppendLine(" push %r8");
                sb.AppendLine(" push %rcx");
                sb.AppendLine(" push %rdx");
                // arguments are in RDI, RSI, RDX, RCX, R8, and R9
                // move them into familiar windows argument regs (rcx, rdx, r8)
                sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
                sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
                sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
                // Two pointer-chasing chains start at arr[0] and arr[0x40].
                sb.AppendLine(" xor %rdi, %rdi");
                sb.AppendLine(" mov $0x40, %esi");
                sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
                sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
                sb.AppendLine("\n" + funcName + "start:");
                sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
                for (int fillerIdx = 0; fillerIdx < this.Counts[i]; fillerIdx++)
                {
                    string labelName = funcName + "part" + fillerIdx;
                    // Compare depends on the pending load, so these jumps sit
                    // in the scheduler until the load returns. They are
                    // expected taken (rdi != rsi), skipping the inc.
                    sb.AppendLine(" cmp %rdi, %rsi");
                    sb.AppendLine(" jne " + labelName);
                    sb.AppendLine(" inc %rax");
                    sb.AppendLine(".align 16");
                    sb.AppendLine(labelName + ":");
                }
                sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
                // Serialize so each iteration starts from an empty window.
                sb.AppendLine("lfence");
                sb.AppendLine(" dec %rcx");
                sb.AppendLine(" jne " + funcName + "start");
                sb.AppendLine(" pop %rdx");
                sb.AppendLine(" pop %rcx");
                sb.AppendLine(" pop %r8");
                sb.AppendLine(" pop %rdi");
                sb.AppendLine(" pop %rsi");
                sb.AppendLine(" ret\n\n");
            }
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            for (int i = 0; i < this.Counts.Length; i++)
            {
                string funcName = this.Prefix + this.Counts[i];
                // args in x0, x1
                sb.AppendLine("\n" + funcName + ":");
                // Save clobbered registers.
                sb.AppendLine(" sub sp, sp, #0x50");
                sb.AppendLine(" stp x14, x15, [sp, #0x10]");
                sb.AppendLine(" stp x12, x13, [sp, #0x20]");
                sb.AppendLine(" stp x10, x11, [sp, #0x30]");
                sb.AppendLine(" stp x25, x26, [sp, #0x40]");
                sb.AppendLine(" mov x15, 1");
                sb.AppendLine(" mov w25, 0x0");
                sb.AppendLine(" mov w26, 0x40");
                sb.AppendLine("\n" + funcName + "start:");
                sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
                for (int nopIdx = 0; nopIdx < this.Counts[i]; nopIdx++)
                {
                    string labelName = funcName + "part" + nopIdx;
                    // Branch condition depends on the pending load; nops pad
                    // the not-taken path like .align does on x86.
                    sb.AppendLine(" cmp w25, w26");
                    sb.AppendLine(" b.ne " + labelName);
                    sb.AppendLine(" add x15, x15, 1");
                    sb.AppendLine(" nop\n nop\n nop");
                    sb.AppendLine(labelName + ":");
                }
                sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
                // Barrier pair serializes each iteration (aarch64's lfence analog here).
                sb.AppendLine(" dsb sy");
                sb.AppendLine(" isb sy");
                sb.AppendLine(" sub x0, x0, 1");
                sb.AppendLine(" cbnz x0, " + funcName + "start");
                sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
                sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
                sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
                sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
                sb.AppendLine(" add sp, sp, #0x50");
                sb.AppendLine(" ret\n\n");
            }
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            // todo
            // NOTE(review): unreachable - SupportsIsa rejects riscv. These are
            // multiplies, not jumps; placeholder copied from another test.
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " mul x30, x30, x5";
            unrolledAdds[1] = " mul x29, x29, x5";
            unrolledAdds[2] = " mul x28, x28, x5";
            unrolledAdds[3] = " mul x31, x31, x5";
            string[] unrolledAdds1 = new string[4];
            unrolledAdds1[0] = " mul x30, x30, x6";
            unrolledAdds1[1] = " mul x31, x31, x6";
            unrolledAdds1[2] = " mul x28, x28, x6";
            unrolledAdds1[3] = " mul x29, x29, x6";
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, false);
        }
    }
}
}
================================================
FILE: AsmGen/tests/Vec512RfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
/// <summary>
/// Vector (512-bit packed FP) register file capacity test. Fills the window
/// with zmm adds behind pending loads so that vector rename capacity becomes
/// the limiter. Requires AVX-512, so amd64 only.
/// </summary>
public class Vec512RfTest : UarchTest
{
    /// <summary>
    /// Builds the test over a range of in-flight vector op counts.
    /// </summary>
    /// <param name="low">Smallest count tested.</param>
    /// <param name="high">Largest count tested.</param>
    /// <param name="step">Increment between counts.</param>
    public Vec512RfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "vec512rf";
        this.Description = "Vector (512-bit packed fp) RF Test - AVX-512 only";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    /// <summary>AVX-512 only, so amd64 only.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        return false;
    }

    /// <summary>Emits the amd64 test functions into <paramref name="sb"/>.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // it's ok, the ptr chasing arr should be way bigger than this
            string initInstrs = " vmovups (%r8), %zmm1\n" +
                " vmovups 64(%r8), %zmm2\n" +
                " vmovups 128(%r8), %zmm3\n" +
                " vmovups 192(%r8), %zmm4\n" +
                " vmovups 256(%r8), %zmm5\n";
            // use all zmm regs
            for (int i = 6; i < 32; i++)
            {
                initInstrs += "vmovups %zmm5, %zmm" + i + "\n";
            }

            // Fix: use the generic List<string>; the non-generic List does not
            // exist under System.Collections.Generic and would not compile.
            List<string> instrsList = new List<string>();
            for (int i = 1; i < 32; i++)
            {
                // Each add writes a distinct zmm register, so every in-flight
                // instruction needs its own physical vector register.
                instrsList.Add($" vaddps %zmm1, %zmm{i}, %zmm{i}");
            }
            string[] unrolledAdds = instrsList.ToArray();
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs);
        }
    }
}
}
================================================
FILE: AsmGen/tests/VecMulNsq.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// Non-scheduler-queue (NSQ) probe built around vector multiplies: mixes a
/// load-dependent instruction chain with independent filler ops to separate
/// scheduler capacity from any queue in front of it.
/// </summary>
public class VecMulNsq : UarchTest
{
    // Total op count (dependent + independent) handed to the NSQ generator.
    private int totalOps;

    /// <summary>
    /// Builds the test over a range of structure sizes.
    /// </summary>
    /// <param name="low">Smallest count tested.</param>
    /// <param name="high">Largest count tested.</param>
    /// <param name="step">Increment between counts.</param>
    /// <param name="totalOps">Total number of ops per measurement block.</param>
    public VecMulNsq(int low, int high, int step, int totalOps)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "vecmulnsq" + totalOps;
        this.Description = "Vector Integer Multiply, excluding possible NSQ";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.totalOps = totalOps;
    }

    /// <summary>amd64 and aarch64.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        if (isa == IUarchTest.ISA.amd64) return true;
        return false;
    }

    /// <summary>Emits the per-ISA test functions into <paramref name="sb"/>.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // xmm1 is loaded behind the pointer chase, so the pmulld chain
            // below depends on the pending load; xmm2 is loop-invariant so
            // the indep instructions can issue immediately.
            string postLoadInstrs = " mov %rdi, %r15\n add %r8, %r15\n movdqu (%r15), %xmm1";
            string initInstrs = " movdqu (%r8), %xmm2";
            string[] depInstrs = new string[4];
            depInstrs[0] = " pmulld %xmm1, %xmm0";
            depInstrs[1] = " pmulld %xmm1, %xmm3";
            depInstrs[2] = " pmulld %xmm1, %xmm4";
            depInstrs[3] = " pmulld %xmm1, %xmm5";
            string[] indepInstrs = new string[2];
            indepInstrs[0] = " pmulld %xmm2, %xmm6";
            indepInstrs[1] = " pmulld %xmm2, %xmm7";
            UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs, postLoadInstrs);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // NOTE(review): these are scalar fadd instructions, not vector
            // integer multiplies, so the aarch64 variant does not match the
            // test's description or the amd64 pmulld path. Looks copy-pasted
            // from an FP-add NSQ test - confirm intent before relying on
            // aarch64 results from this test.
            string postLoadInstrs1 = " ldr s16, [x2, w25, uxtw #2]";
            string initInstrs = " ldr s15, [x2]";
            string[] depInstrs = new string[4];
            depInstrs[0] = " fadd s0, s0, s16";
            depInstrs[1] = " fadd s1, s1, s16";
            depInstrs[2] = " fadd s2, s2, s16";
            depInstrs[3] = " fadd s3, s3, s16";
            string[] indepInstrs = new string[4];
            indepInstrs[0] = " fadd s17, s17, s15";
            indepInstrs[1] = " fadd s18, s18, s15";
            indepInstrs[2] = " fadd s19, s19, s15";
            indepInstrs[3] = " fadd s20, s20, s15";
            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs,
                postLoadInstrs: postLoadInstrs1);
        }
    }
}
}
================================================
FILE: AsmGen/tests/ZeroRobTest.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// Reorder buffer capacity test using zeroing idioms (xor reg,reg / mov #0)
/// as filler, to see whether zeroed registers are eliminated at rename and
/// thus tracked only by the ROB rather than consuming physical registers.
/// </summary>
public class ZeroRobTest : UarchTest
{
    // When true, a load-dependent branch precedes the filler instructions
    // (aarch64 only, see SupportsIsa).
    private bool initialDependentBranch;

    /// <summary>
    /// Builds the test over a range of structure sizes.
    /// </summary>
    /// <param name="low">Smallest count tested.</param>
    /// <param name="high">Largest count tested.</param>
    /// <param name="step">Increment between counts.</param>
    /// <param name="initialDependentBranch">Emit a load-dependent branch before the fillers (aarch64 only).</param>
    public ZeroRobTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "zerorob" + (initialDependentBranch ? "db" : string.Empty);
        this.Description = "Reorder Buffer Test with Zeroing Idioms" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    /// <summary>amd64, aarch64, mips64; dependent-branch variant aarch64 only.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        if (isa == IUarchTest.ISA.mips64) return true;
        return false;
    }

    /// <summary>Emits the per-ISA test functions into <paramref name="sb"/>.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // xor reg,reg is the classic x86 zeroing idiom.
            string[] nops = new string[] { " xor %r11, %r11" };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, true);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            string[] nops = new string[] { " mov x10, 0" };
            // Bug fix: postLoadInstrs was computed but never passed, so the
            // "db" variant emitted a dependent-branch target below without any
            // branch referencing it. Pass it through like the other *db tests
            // (see TakenBranchBufferTest / the store buffer test).
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops,
                includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            string[] nops = new string[] { " move $r14, $r0" };
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            // Bug fix: "mov $r14, $r0" is not RISC-V syntax ($-prefixed regs,
            // and no mov mnemonic). Use the mv pseudo-instruction with x-regs,
            // matching the register names used by the other riscv tests.
            // (Currently unreachable: SupportsIsa does not report riscv.)
            string[] nops = new string[] { " mv x14, x0" };
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, includePtrChasingLoads: true);
        }
    }
}
}
================================================
FILE: CoherencyLatency/CoherencyLatency.cpp
================================================
#include
#include
#ifndef __MINGW32__
#include
#else
#include
#endif
#include
#include
// Default iteration count (overridable with -iterations).
// Fix: removed the trailing semicolon - with it, using the macro inside any
// expression (e.g. 2 * ITERATIONS) would not compile.
#define ITERATIONS 10000000

float RunTest(unsigned int processor1, unsigned int processor2, uint64_t iter);
float RunOwnedTest(unsigned int processor1, unsigned int processor2, uint64_t iter);
DWORD WINAPI LatencyTestThread(LPVOID param);
DWORD WINAPI ReadLatencyTestThread(LPVOID param);

// Page-aligned memory bounced between threads. bouncyBase is the allocation
// base; bouncy points at the cache line currently under test.
LONG64* bouncyBase;
LONG64* bouncy;

// Per-thread parameters for one side of a latency ping-pong.
typedef struct LatencyThreadData {
    uint64_t start;     // initial value to write into target
    uint64_t iterations; // number of iterations to run
    LONG64 *target;     // value to bounce between threads, init with start - 1
    LONG64 *readTarget; // for read test, memory location to read from (owned by other core)
    DWORD affinityMask; // thread affinity mask to set
} LatencyData;
// Entry point: parses options, then measures core-to-core latency for every
// (i, j) processor pair at each requested cache line offset and prints a
// CSV-ish matrix per offset.
int main(int argc, char *argv[]) {
    SYSTEM_INFO sysInfo;
    DWORD numProcs;
    float** latencies;
    uint64_t iter = ITERATIONS;
    int offsets = 1;
    float (*test)(unsigned int, unsigned int, uint64_t) = RunTest;

    // Parse -iterations <n>, -bounce, -owned, -offset <n>
    for (int argIdx = 1; argIdx < argc; argIdx++) {
        if (*(argv[argIdx]) == '-') {
            char* arg = argv[argIdx] + 1;
            if (_strnicmp(arg, "iterations", 10) == 0) {
                argIdx++;
                // Fix: strtoull instead of atoi - atoi truncates counts above
                // INT_MAX. %llu instead of %lu - long is 32-bit on Windows,
                // so %lu mis-prints a uint64_t.
                iter = strtoull(argv[argIdx], NULL, 10);
                fprintf(stderr, "%llu iterations requested\n", iter);
            }
            else if (_strnicmp(arg, "bounce", 6) == 0) {
                fprintf(stderr, "Bouncy\n");
            }
            else if (_strnicmp(arg, "owned", 5) == 0) {
                test = RunOwnedTest;
                fprintf(stderr, "Using separate cache lines for each thread to write to\n");
            }
            else if (_strnicmp(arg, "offset", 6) == 0) {
                argIdx++;
                offsets = atoi(argv[argIdx]);
                fprintf(stderr, "Offsets: %d\n", offsets);
            }
        }
    }

    // One cache line (64 B) per requested offset, page aligned.
    bouncyBase = (LONG64*)_aligned_malloc(64 * offsets, 4096);
    bouncy = bouncyBase;
    if (bouncy == NULL) {
        // Fix: bail out instead of continuing - the test threads would
        // dereference this NULL pointer.
        fprintf(stderr, "Could not allocate aligned mem\n");
        return 1;
    }

    GetSystemInfo(&sysInfo);
    numProcs = sysInfo.dwNumberOfProcessors;
    fprintf(stderr, "Number of CPUs: %u\n", numProcs);

    latencies = (float **)malloc(sizeof(float*) * offsets);
    if (latencies == NULL) {
        fprintf(stderr, "couldn't allocate result array\n");
        return 0;
    }

    // Loop index is int (matching `offsets`) to avoid signed/unsigned mixups.
    for (int offsetIdx = 0; offsetIdx < offsets; offsetIdx++) {
        bouncy = (LONG64*)((char*)bouncyBase + offsetIdx * 64);
        latencies[offsetIdx] = (float*)malloc(sizeof(float) * numProcs * numProcs);
        if (latencies[offsetIdx] == NULL) {
            // Fix: previously unchecked; the fill loop below would crash.
            fprintf(stderr, "couldn't allocate result array\n");
            return 0;
        }
        float* latenciesPtr = latencies[offsetIdx];
        // Run all to all, skipping testing a core against itself ofc
        // technically can skip the other way around (start j = i + 1) but meh
        for (DWORD i = 0; i < numProcs; i++) {
            for (DWORD j = 0; j < numProcs; j++) {
                latenciesPtr[j + i * numProcs] = i == j ? 0 : test(i, j, iter);
            }
        }
    }

    for (int offsetIdx = 0; offsetIdx < offsets; offsetIdx++) {
        printf("Cache line offset: %d\n", offsetIdx);
        float* latenciesPtr = latencies[offsetIdx];
        // print thing to copy to excel
        for (DWORD i = 0; i < numProcs; i++) {
            for (DWORD j = 0; j < numProcs; j++) {
                if (j != 0) printf(",");
                if (j == i) printf("x");
                else printf("%f", latenciesPtr[j + i * numProcs]);
            }
            printf("\n");
        }
        free(latenciesPtr);
    }
    free(latencies);
    _aligned_free(bouncyBase);
    return 0;
}
// Runs one timed pair of test threads pinned to processor1/processor2 and
// returns per-hop latency in ns (total time / iterations / 2), or -1 if
// thread creation failed.
float TimeThreads(unsigned int processor1, unsigned int processor2, uint64_t iter, LatencyData lat1, LatencyData lat2, DWORD (*threadFunc)(LPVOID)) {
    struct timeb start, end;
    HANDLE testThreads[2];
    DWORD tid1, tid2;
    // Create suspended so affinity can be set before either thread runs.
    testThreads[0] = CreateThread(NULL, 0, threadFunc, &lat1, CREATE_SUSPENDED, &tid1);
    testThreads[1] = CreateThread(NULL, 0, threadFunc, &lat2, CREATE_SUSPENDED, &tid2);
    if (testThreads[0] == NULL || testThreads[1] == NULL) {
        // NOTE(review): if exactly one CreateThread succeeded, that handle is
        // leaked (and its thread left suspended) - confirm acceptable.
        fprintf(stderr, "Failed to create test threads\n");
        return -1;
    }
    SetThreadAffinityMask(testThreads[0], 1ULL << (uint64_t)processor1);
    SetThreadAffinityMask(testThreads[1], 1ULL << (uint64_t)processor2);
    // Timing brackets resume -> both threads done; startup cost is amortized
    // over the (large) iteration count.
    ftime(&start);
    ResumeThread(testThreads[0]);
    ResumeThread(testThreads[1]);
    WaitForMultipleObjects(2, testThreads, TRUE, INFINITE);
    ftime(&end);
    int64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm);
    float latency = 1e6 * (float)time_diff_ms / (float)iter;
    fprintf(stderr, "%d to %d: %f ns\n", processor1, processor2, latency);
    CloseHandle(testThreads[0]);
    CloseHandle(testThreads[1]);
    // each thread does interlocked compare and exchange iterations times. divide by 2 to get overall count of locked ops
    return latency / 2;
}
///
/// Measures latency from one processor core to another
///
/// processor number 1
/// processor number 2
/// Number of iterations
/// aligned mem to bounce around
/// latency per iteration in ns
float RunTest(unsigned int processor1, unsigned int processor2, uint64_t iter) {
LatencyData lat1, lat2;
float latency;
*bouncy = 0;
lat1.iterations = iter;
lat1.start = 1;
lat1.target = bouncy;
lat2.iterations = iter;
lat2.start = 2;
lat2.target = bouncy;
latency = TimeThreads(processor1, processor2, iter, lat1, lat2, LatencyTestThread);
return latency;
}
// Variant of RunTest where each thread writes to its own cache line and reads
// the line owned by the other core, avoiding ownership ping-pong of a single
// line. Returns latency per hop in ns, or -1 on allocation failure.
float RunOwnedTest(unsigned int processor1, unsigned int processor2, uint64_t iter) {
    LatencyData lat1, lat2;
    LONG64* target1, * target2;
    float latency;
    // drop them on different cache lines
    target1 = (LONG64*)_aligned_malloc(128, 64);
    if (target1 == NULL) {
        // Fix: previously fell through and dereferenced the NULL pointer
        // below. Return -1, matching TimeThreads' failure convention.
        fprintf(stderr, "Could not allocate aligned mem\n");
        return -1;
    }
    target2 = target1 + 8; // 8 * sizeof(LONG64) = 64 bytes -> next cache line
    *target1 = 1;
    *target2 = 0;
    lat1.iterations = iter;
    lat1.start = 3;
    lat1.target = target1;
    lat1.readTarget = target2;
    lat2.iterations = iter;
    lat2.start = 2;
    lat2.target = target2;
    lat2.readTarget = target1;
    latency = TimeThreads(processor1, processor2, iter, lat1, lat2, ReadLatencyTestThread);
    _aligned_free(target1);
    return latency;
}
///
/// Runs one thread of the latency test. should be run in pairs
/// Always writes to target
///
/// Latency test params
/// next value that would have been written to shared memory
// One side of the locked ping-pong. Each thread repeatedly tries to swap the
// shared value from (next - 1) to next; one thread publishes odd values, the
// other even, so every successful CAS implies a round trip from the peer.
// Returns the next value that would have been published.
DWORD WINAPI LatencyTestThread(LPVOID param) {
    LatencyData *data = (LatencyData *)param;
    uint64_t next = data->start;
    const uint64_t limit = 2 * data->iterations;

    while (next <= limit) {
        // Succeeds only once the peer has published next - 1.
        if (_InterlockedCompareExchange64(data->target, next, next - 1) == next - 1) {
            next += 2;
        }
    }
    return next;
}
///
/// Similar thing but tries to not bounce cache line ownership
/// Instead, threads write to different cache lines
///
/// Latency test params
/// next value that would have been written to owned mem
// One side of the "owned" ping-pong: spin-read the peer's cache line, and
// when the expected value shows up, publish the next value to our own line.
// Returns the next value that would have been written.
// Fix: removed an unused `startTsc = __rdtsc()` local left over from earlier
// timing code.
DWORD WINAPI ReadLatencyTestThread(LPVOID param) {
    LatencyData* latencyData = (LatencyData*)param;
    uint64_t current = latencyData->start;
    while (current <= 2 * latencyData->iterations) {
        if (*(latencyData->readTarget) == current - 1) {
            *(latencyData->target) = current;
            current += 2;
            // Push the store out promptly so the peer observes it.
            _mm_sfence();
        }
    }
    return current;
}
================================================
FILE: CoherencyLatency/CoherencyLatency.sln
================================================
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.31025.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CoherencyLatency", "CoherencyLatency.vcxproj", "{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|x64 = Release|x64
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x64.ActiveCfg = Debug|x64
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x64.Build.0 = Debug|x64
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x86.ActiveCfg = Debug|Win32
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x86.Build.0 = Debug|Win32
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x64.ActiveCfg = Release|x64
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x64.Build.0 = Release|x64
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x86.ActiveCfg = Release|Win32
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x86.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {A6E60C3D-60ED-4DBF-B4AA-7C1C3A140325}
EndGlobalSection
EndGlobal
================================================
FILE: CoherencyLatency/CoherencyLatency.vcxproj
================================================
Debug
Win32
Release
Win32
Debug
x64
Release
x64
16.0
Win32Proj
{6d9ccc8c-09f5-484b-8630-be18a9cf1995}
CoherencyLatency
10.0
Application
true
v142
Unicode
Application
false
v142
true
Unicode
Application
true
v142
Unicode
Application
false
v142
true
Unicode
true
false
true
false
Level3
true
WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
Level3
true
_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
================================================
FILE: CoherencyLatency/Makefile
================================================
# arch_detect.mk presumably sets $(CC), $(LDFLAGS), and $(TARGET) for the host
# architecture - confirm in ../Common/arch_detect.mk.
include ../Common/arch_detect.mk
CFLAGS = -pthread -O3

# Default: build whichever arch target detection selected.
all: $(TARGET)

# Linux builds use the pthreads implementation.
amd64:
	$(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_amd64 $(LDFLAGS)
aarch64:
	$(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_aarch64 $(LDFLAGS)
riscv64:
	$(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_riscv64 $(LDFLAGS)

# Windows cross-build uses the Win32 implementation.
w64:
	$(CC) $(CFLAGS) CoherencyLatency.cpp -o CoherencyLatency_w64.exe $(LDFLAGS)

# w64 can build with mingw 11, which isn't available on jammy
ci: amd64 aarch64 riscv64

clean:
	rm -rf *.o *.zip "ocl-icd-libopencl1*" "OpenCL-SDK*" && find . -type f -executable -delete

.PHONY: all ci clean
================================================
FILE: CoherencyLatency/PThreadsCoherencyLatency.c
================================================
#define _GNU_SOURCE
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
// Default iteration count (overridable with -iterations).
// Fix: removed the trailing semicolon - with it, using the macro inside any
// expression (e.g. 2 * ITERATIONS) would not compile.
#define ITERATIONS 10000000

// glibc only gained a gettid() wrapper recently; call the syscall directly.
#define gettid() syscall(SYS_gettid)

// Per-thread parameters for one side of a latency ping-pong.
typedef struct LatencyThreadData {
    uint64_t start;               // first value this thread publishes
    uint64_t iterations;          // number of round trips to perform
    volatile uint64_t *target;    // shared location bounced between the two threads
    unsigned int processorIndex;  // logical CPU this thread pins itself to
} LatencyData;

// Parameters and result slot for one core-pair measurement.
typedef struct LatencyPairRunData {
    uint32_t processor1;
    uint32_t processor2;
    uint64_t iter;
    float result;      // latency in ns, filled in by RunTest
    uint64_t *target;  // cache line assigned to this pair
} LatencyPairRunData;

void *LatencyTestThread(void *param);
void *NoLockLatencyTestThread(void *param);

// Thread body selected on the command line: default is the locked CAS
// ping-pong; -nolock switches to plain loads/stores.
void *(*testFunc)(void *) = LatencyTestThread;

void *RunTest(void *param);
// Entry point: parses options, then measures core-to-core latency for every
// (i, j) processor pair at each requested cache line offset, optionally
// running several non-overlapping pairs in parallel. Prints one matrix per
// offset.
int main(int argc, char *argv[]) {
    float **latencies;
    int *parallelTestState;
    int numProcs, offsets = 1, parallelismFactor = 1;
    uint64_t iter = ITERATIONS;
    uint64_t *bouncyArr;

    numProcs = get_nprocs();
    fprintf(stderr, "Number of CPUs: %d\n", numProcs); // fix: %d, numProcs is int

    // Parse -iterations <n>, -nolock, -offset <n>, -parallel <n>
    for (int argIdx = 1; argIdx < argc; argIdx++) {
        if (*(argv[argIdx]) == '-') {
            char* arg = argv[argIdx] + 1;
            if (strncmp(arg, "iterations", 10) == 0) {
                argIdx++;
                // Fix: strtoull instead of atoi - atoi truncates counts above INT_MAX.
                iter = strtoull(argv[argIdx], NULL, 10);
                fprintf(stderr, "%lu iterations requested\n", iter);
            }
            else if (strncmp(arg, "nolock", 6) == 0) {
                fprintf(stderr, "No locks, plain loads and stores\n");
                testFunc = NoLockLatencyTestThread;
            }
            else if (strncmp(arg, "offset", 6) == 0) {
                argIdx++;
                offsets = atoi(argv[argIdx]);
                fprintf(stderr, "Offsets: %d\n", offsets);
            }
            else if (strncmp(arg, "parallel", 8) == 0) {
                argIdx++;
                parallelismFactor = atoi(argv[argIdx]);
                fprintf(stderr, "Will go for %d runs in parallel\n", parallelismFactor);
            }
        }
    }

    latencies = (float **)malloc(sizeof(float *) * offsets);
    // parallelTestState[j + i * numProcs]: 0 = not run, 1 = in flight, 2 = done
    parallelTestState = (int *)malloc(sizeof(int) * numProcs * numProcs);
    // Fix: zero sizeof(float *) per entry, not sizeof(float) - the array
    // holds pointers, so the old size only cleared half of it on 64-bit.
    memset(latencies, 0, sizeof(float *) * offsets);
    // One page per parallel slot; each pair gets its own cache line region.
    if (0 != posix_memalign((void **)(&bouncyArr), 4096, 4096 * parallelismFactor)) {
        fprintf(stderr, "Could not allocate aligned mem\n");
        return 0;
    }

    LatencyPairRunData *pairRunData = (LatencyPairRunData *)malloc(sizeof(LatencyPairRunData) * parallelismFactor);
    for (int offsetIdx = 0; offsetIdx < offsets; offsetIdx++) {
        latencies[offsetIdx] = (float *)malloc(sizeof(float) * numProcs * numProcs);
        memset(parallelTestState, 0, sizeof(int) * numProcs * numProcs);
        float *latenciesPtr = latencies[offsetIdx];
        while (1) {
            // select parallelismFactor threads
            int selectedParallelTestCount = 0;
            memset(pairRunData, 0, sizeof(LatencyPairRunData) * parallelismFactor);
            for (int i = 0; i < numProcs && selectedParallelTestCount < parallelismFactor; i++) {
                for (int j = 0; j < numProcs && selectedParallelTestCount < parallelismFactor; j++) {
                    if (j == i) { latenciesPtr[j + i * numProcs] = 0; continue; }
                    if (parallelTestState[j + i * numProcs] == 1) {
                        fprintf(stderr, "Thread unexpectedly did not complete\n");
                        exit(0);
                    }
                    if (parallelTestState[j + i * numProcs] == 0) {
                        // neither thread can already have a pending run
                        int validPair = 1;
                        for (int c = 0; c < numProcs; c++) {
                            if (parallelTestState[j + c * numProcs] == 1 ||
                                parallelTestState[c + i * numProcs] == 1 ||
                                parallelTestState[i + c * numProcs] == 1 ||
                                parallelTestState[c + j * numProcs] == 1) {
                                validPair = 0;
                                break;
                            }
                        }
                        if (!validPair) continue;
                        // for SMT enabled CPUs, check sibling threads. will do later
                        parallelTestState[j + i * numProcs] = 1;
                        pairRunData[selectedParallelTestCount].processor1 = i;
                        pairRunData[selectedParallelTestCount].processor2 = j;
                        pairRunData[selectedParallelTestCount].iter = iter;
                        pairRunData[selectedParallelTestCount].result = 0.0f;
                        // 512 uint64s = one 4 KB page per slot; 8 uint64s = one
                        // 64 B cache line per offset within the slot's page.
                        pairRunData[selectedParallelTestCount].target = bouncyArr + (512 * selectedParallelTestCount + 8 * offsetIdx);
                        fprintf(stderr, "Selected %d -> %d\n", i, j);
                        selectedParallelTestCount++;
                    }
                }
            }
            // No pairs left to run at this offset.
            if (selectedParallelTestCount == 0) break;

            // launch threads
            fprintf(stderr, "Selected %d pairs for parallel testing\n", selectedParallelTestCount);
            pthread_t *testThreads = (pthread_t *)malloc(selectedParallelTestCount * sizeof(pthread_t));
            memset(testThreads, 0, selectedParallelTestCount * sizeof(pthread_t));
            for (int parallelIdx = 0; parallelIdx < selectedParallelTestCount; parallelIdx++) {
                // (0, 0) can never be selected (i == j is skipped), so this
                // only guards against uninitialized tail entries.
                if (pairRunData[parallelIdx].processor1 == 0 && pairRunData[parallelIdx].processor2 == 0) break;
                pthread_create(testThreads + parallelIdx, NULL, RunTest, (void *)(pairRunData + parallelIdx));
            }
            // join threads and record their results
            for (int parallelIdx = 0; parallelIdx < selectedParallelTestCount; parallelIdx++) {
                pthread_join(testThreads[parallelIdx], NULL);
                int i = pairRunData[parallelIdx].processor1;
                int j = pairRunData[parallelIdx].processor2;
                latenciesPtr[j + i * numProcs] = pairRunData[parallelIdx].result;
                parallelTestState[j + i * numProcs] = 2;
            }
            free(testThreads);
        }
    }

    for (int offsetIdx = 0; offsetIdx < offsets; offsetIdx++) {
        float *latenciesPtr = latencies[offsetIdx];
        printf("Cache line offset: %d\n", offsetIdx);
        for (int i = 0; i < numProcs; i++) {
            for (int j = 0; j < numProcs; j++) {
                if (j != 0) printf(",");
                if (j == i) printf("x");
                // to maintain consistency, divide by 2 (see justification in windows version)
                else printf("%f", latenciesPtr[j + i * numProcs] / 2);
            }
            printf("\n");
        }
        free(latenciesPtr);
    }
    free(parallelTestState);
    free(pairRunData);
    free(latencies);
    free(bouncyArr);
    return 0;
}
// run test and gather timing data using the specified thread function
// run test and gather timing data using the specified thread function
// (threads pin themselves via lat1/lat2->processorIndex; proc1/proc2 are kept
// for signature parity with the Windows version). Returns latency per
// iteration in ns, or 0 if thread creation failed.
float TimeThreads(unsigned int proc1,
                  unsigned int proc2,
                  uint64_t iter,
                  LatencyData *lat1,
                  LatencyData *lat2,
                  void *(*threadFunc)(void *)) {
    struct timeval tvBegin, tvEnd;
    struct timezone tzBegin, tzEnd;
    pthread_t workers[2];
    void *retval1, *retval2;

    // Timing deliberately brackets creation through join; startup cost is
    // amortized over the large iteration count.
    gettimeofday(&tvBegin, &tzBegin);
    int rc1 = pthread_create(&workers[0], NULL, threadFunc, (void *)lat1);
    int rc2 = pthread_create(&workers[1], NULL, threadFunc, (void *)lat2);
    if (rc1 != 0 || rc2 != 0) {
        fprintf(stderr, "Could not create threads\n");
        return 0;
    }
    pthread_join(workers[0], &retval1);
    pthread_join(workers[1], &retval2);
    gettimeofday(&tvEnd, &tzEnd);

    uint64_t elapsedMs = 1000 * (tvEnd.tv_sec - tvBegin.tv_sec) + ((tvEnd.tv_usec - tvBegin.tv_usec) / 1000);
    return 1e6 * (float)elapsedMs / (float)iter;
}
// test latency between two logical CPUs
// float RunTest(unsigned int processor1, unsigned int processor2, uint64_t iter) {
void *RunTest(void *param) {
LatencyPairRunData *pairRunData = (LatencyPairRunData *)param;
uint32_t processor1 = pairRunData->processor1;
uint32_t processor2 = pairRunData->processor2;
uint64_t iter = pairRunData->iter;
LatencyData lat1, lat2;
float latency;
*(pairRunData->target) = 0;
lat1.iterations = iter;
lat1.start = 1;
lat1.target = pairRunData->target;
lat1.processorIndex = processor1;
lat2.iterations = iter;
lat2.start = 2;
lat2.target = pairRunData->target;
lat2.processorIndex = processor2;
latency = TimeThreads(processor1, processor2, iter, &lat1, &lat2, NoLockLatencyTestThread);
fprintf(stderr, "%d to %d: %f ns\n", processor1, processor2, latency);
pairRunData->result = latency;
return NULL;
}
// Locked-variant ping-pong thread: pins itself to its assigned CPU, then
// bounces the shared 64-bit location with its partner using an atomic
// compare-and-swap. A thread advances the value by 1 only when it observes
// the partner's last write (current - 1); one round trip = two increments.
void *LatencyTestThread(void *param) {
LatencyData *latencyData = (LatencyData *)param;
cpu_set_t cpuset;
uint64_t current = latencyData->start;
CPU_ZERO(&cpuset);
CPU_SET(latencyData->processorIndex, &cpuset);
sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
//fprintf(stderr, "thread %ld set affinity %d\n", gettid(), latencyData->processorIndex);
// spin until this thread has published its share of 2 * iterations values
while (current <= 2 * latencyData->iterations) {
if (__sync_bool_compare_and_swap(latencyData->target, current - 1, current)) current += 2;
}
pthread_exit(NULL);
}
// Lock-free variant of LatencyTestThread: plain load + store instead of a
// compare-and-swap, so it measures raw cache line transfer latency.
// NOTE(review): correctness relies on the compiler re-reading *target each
// pass (presumably target points to volatile storage — confirm against the
// LatencyData declaration) and on aligned 64-bit accesses being atomic.
void *NoLockLatencyTestThread(void *param) {
LatencyData *latencyData = (LatencyData *)param;
cpu_set_t cpuset;
uint64_t current = latencyData->start;
CPU_ZERO(&cpuset);
CPU_SET(latencyData->processorIndex, &cpuset);
sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
// write our next value only after observing the partner's previous one
while (current <= 2 * latencyData->iterations) {
if (*(latencyData->target) == current - 1) {
*(latencyData->target) = current;
current += 2;
}
}
pthread_exit(NULL);
}
================================================
FILE: CoherencyLatency/c2cparse/Program.cs
================================================
// See https://aka.ms/new-console-template for more information
using System;
public class C2CParse
{
    // Hardware layout of the machine the Linux dump came from:
    // SMT width and physical core count used by the index translation.
    private const int DefaultSmtCount = 4;
    private const int DefaultCoreCount = 64;

    /// <summary>
    /// Reads a square CSV core-to-core latency matrix (Linux core numbering)
    /// from the file named in args[0] and prints it re-ordered into Windows
    /// core numbering.
    /// </summary>
    public static void Main(string[] args)
    {
        if (args.Length == 0)
        {
            Console.WriteLine("Need filename as arg");
            return;
        }

        // RemoveEmptyEntries tolerates a trailing newline, which previously
        // produced a spurious empty row and failed the square-matrix check.
        string[] inputLines = File.ReadAllText(args[0])
            .Split('\n', StringSplitOptions.RemoveEmptyEntries);
        int n = inputLines.Length;
        string[] inputLatencies = new string[n * n];
        string[] outputLatencies = new string[n * n];

        for (int row = 0; row < n; row++)
        {
            // TrimEnd('\r') tolerates CRLF input; validate every row's
            // column count, not just the first one.
            string[] lineSplit = inputLines[row].TrimEnd('\r').Split(',');
            if (lineSplit.Length != n)
            {
                Console.WriteLine("Line count: {0}, line segments: {1} must be equal", n, lineSplit.Length);
                return;
            }
            for (int i = 0; i < n; i++)
            {
                inputLatencies[row * n + i] = lineSplit[i];
            }
        }

        // Translate both the row and column index of every cell.
        for (int row = 0; row < n; row++)
        {
            for (int col = 0; col < n; col++)
            {
                int newRow = GetCoreIndex(row, DefaultSmtCount, DefaultCoreCount);
                int newCol = GetCoreIndex(col, DefaultSmtCount, DefaultCoreCount);
                outputLatencies[newRow * n + newCol] = inputLatencies[row * n + col];
            }
        }

        for (int row = 0; row < n; row++)
        {
            for (int col = 0; col < n; col++)
            {
                Console.Write(",{0}", outputLatencies[row * n + col]);
            }
            Console.WriteLine();
        }
    }

    /// <summary>
    /// Convert a Linux logical core index (physical cores enumerated first,
    /// SMT siblings appended after all physical cores) to a Windows logical
    /// core index (SMT siblings adjacent).
    /// </summary>
    /// <param name="inputIndex">Linux logical core index.</param>
    /// <param name="smtCount">Threads per physical core.</param>
    /// <param name="coreCount">Number of physical cores.</param>
    /// <returns>The equivalent Windows logical core index.</returns>
    public static int GetCoreIndex(int inputIndex, int smtCount, int coreCount)
    {
        int physicalCoreIndex = inputIndex % coreCount;
        int smtIndex = inputIndex / coreCount;
        return physicalCoreIndex * smtCount + smtIndex;
    }
}
================================================
FILE: CoherencyLatency/c2cparse/c2cparse.csproj
================================================
<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>
</Project>
================================================
FILE: CoherencyLatency/c2cparse/c2cparse.sln
================================================
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.4.33110.190
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "c2cparse", "c2cparse.csproj", "{F9E172EC-1A9A-4908-9512-4547CD1CFD80}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {4C3856A5-1183-4D5F-80BE-3D694765A594}
EndGlobalSection
EndGlobal
================================================
FILE: Common/arch_detect.mk
================================================
# Detect the build target from the host environment.
# May be overridden on the command line: make TARGET=aarch64
TARGET ?= amd64
ifeq ($(OS),Windows_NT)
TARGET = w64
else
UNAME_M := $(shell uname -m)
ifeq ($(UNAME_M),x86_64)
TARGET = amd64
endif
ifeq ($(UNAME_M),aarch64)
TARGET = aarch64
endif
ifeq ($(UNAME_M),riscv64)
TARGET = riscv64
endif
UNAME_S := $(shell uname -s)
# Darwin takes precedence over the machine type detected above
ifeq ($(UNAME_S),Darwin)
TARGET = darwin
endif
endif
# Per-target compiler selection (target-specific variable values).
# NOTE(review): aarch64 uses the native gcc while aarch64_numa uses the
# cross toolchain — confirm this asymmetry is intentional.
amd64: CC = x86_64-linux-gnu-gcc
amd64_numa: CC = x86_64-linux-gnu-gcc
aarch64: CC := gcc
aarch64_numa: CC = aarch64-linux-gnu-gcc
riscv64: CC = riscv64-linux-gnu-gcc
w64: CC = x86_64-w64-mingw32-gcc
darwin: CC = clang
================================================
FILE: Common/ci_gpumemlatency.sh
================================================
#!/bin/sh
# CI build script for GpuMemLatency: cross-compiles the OpenCL benchmark
# for every supported target, cleaning object files between targets so
# objects from one architecture never leak into the next link.
make_all () {
make amd64
make clean-obj
LDFLAGS="-lm -L ocl-icd-arm64/usr/lib/aarch64-linux-gnu -lOpenCL" make aarch64
make clean-obj
LDFLAGS="-lm -L ocl-icd-riscv64/usr/lib/riscv64-linux-gnu -lOpenCL" make riscv64
make clean-obj
CPPFLAGS="-I OpenCL-SDK-${OCL_VER}-Win-x64/include" LDFLAGS="-lm -L OpenCL-SDK-${OCL_VER}-Win-x64/lib -lOpenCL" make w64
make clean-obj
}
# Fetch the OpenCL ICD loader for the foreign Linux architectures by adding
# ubuntu-ports apt sources, downloading the .deb, and extracting it locally.
linux_deps () {
for ARCH in arm64 riscv64; do
if ! grep -q $ARCH /etc/apt/sources.list; then
echo "deb [arch=${ARCH}] http://ports.ubuntu.com/ubuntu-ports $(lsb_release -c -s) universe" | sudo tee -a /etc/apt/sources.list
echo "deb-src [arch=${ARCH}] http://ports.ubuntu.com/ubuntu-ports $(lsb_release -c -s) universe" | sudo tee -a /etc/apt/sources.list
sudo apt update
fi
apt-get download "ocl-icd-libopencl1:${ARCH}"
find . -type f -name "*${ARCH}*.deb" -exec dpkg-deb -x {} "ocl-icd-${ARCH}" \;
done
# the linker needs the unversioned .so name
cp ocl-icd-arm64/usr/lib/aarch64-linux-gnu/libOpenCL.so.1 ocl-icd-arm64/usr/lib/aarch64-linux-gnu/libOpenCL.so
cp ocl-icd-riscv64/usr/lib/riscv64-linux-gnu/libOpenCL.so.1 ocl-icd-riscv64/usr/lib/riscv64-linux-gnu/libOpenCL.so
}
# Fetch the Khronos OpenCL SDK for the Windows cross build.
w64_deps () {
# fix: was "-fssLO" — the duplicated -s meant -S was never passed, so
# download errors were silenced instead of reported
curl -fsSLO "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/${OCL_VER}/OpenCL-SDK-${OCL_VER}-Win-x64.zip"
unzip "OpenCL-SDK-${OCL_VER}-Win-x64.zip"
}
linux_deps
w64_deps
make_all
================================================
FILE: Common/ci_package.sh
================================================
#!/bin/sh
# Package all prebuilt benchmark binaries into clammarks.txz.
# Layout: clammarks-<short-sha>/<target>/<binaries + OpenCL kernels>
PKG="clammarks-$(git rev-parse --short HEAD)"
rm -rf "$PKG" "clammarks.txz"
mkdir -p "$PKG"
for TARGET in "amd64" "aarch64" "riscv64" "w64"; do
mkdir "$PKG/$TARGET"
# copy every executable whose filename mentions the target architecture
for COMPONENT in CoherencyLatency MemoryLatency MemoryBandwidth InstructionRate Meshsim CoreClockChecker GpuMemLatency; do
find "$COMPONENT" -type f -name "*$TARGET*" -executable -exec cp {} "$PKG/$TARGET" \;
done
# OpenCL kernel sources are loaded at runtime next to the binary
find "GpuMemLatency" -type f -name "*.cl" -exec cp {} "$PKG/$TARGET" \;
done
cp "LICENSE" "$PKG"
tar caf "clammarks.txz" "$PKG"
================================================
FILE: Common/perfmon.h
================================================
// Stuff that only works on Linux. Should be #ifdef-ed out for mingw cross compilation
// Read a 64-bit MSR on the given core via the msr kernel module
// (/dev/cpu/<core>/msr, MSR index = file offset). Returns 0 on any failure.
uint64_t readmsr(uint32_t coreindex, uint32_t msrindex) {
    char buf[256];
    snprintf(buf, sizeof(buf), "/dev/cpu/%d/msr", coreindex);
    uint64_t msrvalue = 0;
    // O_RDONLY is sufficient: this helper only reads (the original asked
    // for O_RDWR, which fails needlessly without write permission)
    int fd = open(buf, O_RDONLY);
    if (fd == -1) {
        fprintf(stderr, "Could not open msr\n");
        return 0;
    }
    // pread replaces the unchecked lseek+read pair; a short read no longer
    // returns stale stack garbage
    if (pread(fd, &msrvalue, sizeof(msrvalue), msrindex) != (ssize_t)sizeof(msrvalue)) {
        fprintf(stderr, "Could not read msr 0x%x on core %u\n", msrindex, coreindex);
        msrvalue = 0;
    }
    close(fd);
    return msrvalue;
}
#define PERF_NUM_EVENTS 4
// Layout of a PERF_FORMAT_GROUP | PERF_FORMAT_ID read from the group
// leader fd: event count followed by (value, id) pairs.
struct perf_read_data {
uint64_t nr;
struct {
uint64_t value;
uint64_t id;
} values[PERF_NUM_EVENTS];
};
// Bookkeeping for one selected perf event.
struct perf_select_data {
uint64_t id; // id used to identify the event when it comes back in a group
int fd; // file descriptor
struct perf_event_attr attr;
uint64_t value; // last counter value captured by stop_perf_monitoring()
const char *description; // CSV column name
};
// Global monitoring state shared by the open/start/stop/close helpers below.
struct perf_select_data perf_selected_events[PERF_NUM_EVENTS];
struct perf_read_data perfReadData;
struct timeval perf_startTv, perf_endTv;
uint64_t perf_time_ms;
// populates basic properties
// Fill a perf_event_attr for a hardware event: user-space counts only,
// created disabled (enabled later via PERF_EVENT_IOC_ENABLE on the group),
// read back as part of a PERF_FORMAT_GROUP.
void initialize_hw_event(struct perf_event_attr *attr, uint64_t cfg, uint32_t hwid) {
memset(attr, 0, sizeof(struct perf_event_attr));
// low 32 bits of config = hardware event id
// high 32 bits = PMU id (atom/core). Get from /sys/devices//type
// on Arrow Lake, atom = 10, core = 4
attr->config = cfg | ((uint64_t)hwid << 32);
attr->type = PERF_TYPE_HARDWARE;
attr->size = sizeof(struct perf_event_attr);
attr->disabled = 1;
attr->exclude_kernel = 1;
attr->exclude_hv = 1;
attr->inherit = 1; // include child threads
attr->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
}
// Open the perf event described by evt->attr for the calling process on any
// CPU, joining the group led by groupfd (-1 = become group leader), and
// record the kernel-assigned id used to match values in group reads.
void set_hw_event(struct perf_select_data *evt, int groupfd) {
    evt->fd = syscall(__NR_perf_event_open, &(evt->attr), 0, -1, groupfd, 0);
    if (evt->fd == -1) {
        // the original went straight to ioctl on -1; report instead
        fprintf(stderr, "perf_event_open failed for %s\n",
            evt->description ? evt->description : "(unnamed event)");
        return;
    }
    if (ioctl(evt->fd, PERF_EVENT_IOC_ID, &(evt->id)) == -1) {
        fprintf(stderr, "PERF_EVENT_IOC_ID failed for fd %d\n", evt->fd);
    }
}
// Create the four monitored events (instructions, cycles, LLC references,
// LLC misses) as a single perf group led by the instructions event, so all
// four are read atomically from the leader fd.
void open_perf_monitoring() {
int groupLeaderFd = -1;
memset(perf_selected_events, 0, sizeof(struct perf_select_data) * PERF_NUM_EVENTS);
perf_selected_events[0].description = "instructions";
initialize_hw_event(&(perf_selected_events[0].attr), PERF_COUNT_HW_INSTRUCTIONS, 0);
set_hw_event(perf_selected_events, -1);
groupLeaderFd = perf_selected_events[0].fd;
perf_selected_events[1].description = "cycles";
initialize_hw_event(&(perf_selected_events[1].attr), PERF_COUNT_HW_CPU_CYCLES, 0);
set_hw_event(perf_selected_events + 1, groupLeaderFd);
// NOTE(review): 0x4F2E / 0x412E are Intel raw encodings for last-level
// cache references / misses — not portable to other vendors' PMUs.
perf_selected_events[2].description = "llc_ref";
initialize_hw_event(&(perf_selected_events[2].attr), 0x4F2E, 0);
perf_selected_events[2].attr.type = PERF_TYPE_RAW;
set_hw_event(perf_selected_events + 2, groupLeaderFd);
perf_selected_events[3].description = "llc_miss";
initialize_hw_event(&(perf_selected_events[3].attr), 0x412E, 0);
perf_selected_events[3].attr.type = PERF_TYPE_RAW;
set_hw_event(perf_selected_events + 3, groupLeaderFd);
}
// Zero all counters in the group and start counting. Also records the
// wall-clock start time used by stop_perf_monitoring() for perf_time_ms.
void start_perf_monitoring() {
gettimeofday(&perf_startTv, NULL);
int groupLeaderFd = perf_selected_events[0].fd;
ioctl(groupLeaderFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
ioctl(groupLeaderFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}
uint64_t instrs, cycles, llcRef, llcMiss;
// Disable the event group, read back all counter values from the leader fd,
// match them to the selected events by id, and record elapsed wall-clock
// time since start_perf_monitoring() in perf_time_ms.
void stop_perf_monitoring() {
    int groupLeaderFd = perf_selected_events[0].fd;
    ioctl(groupLeaderFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
    ssize_t readbytes = read(groupLeaderFd, &perfReadData, sizeof(struct perf_read_data));
    if (readbytes < 0) {
        // failed read would otherwise leave perfReadData uninitialized
        fprintf(stderr, "Could not read perf counters\n");
        perfReadData.nr = 0;
    }
    // match each returned (value, id) pair to the event with the same id;
    // bound by PERF_NUM_EVENTS so a larger-than-expected nr can't walk off
    // the values[] array
    for (uint64_t i = 0; i < perfReadData.nr && i < PERF_NUM_EVENTS; i++) {
        for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) {
            if (perf_selected_events[evt_idx].id == perfReadData.values[i].id) {
                perf_selected_events[evt_idx].value = perfReadData.values[i].value;
            }
        }
    }
    gettimeofday(&perf_endTv, NULL);
    perf_time_ms = ((perf_endTv.tv_sec - perf_startTv.tv_sec) * 1000 + (perf_endTv.tv_usec - perf_startTv.tv_usec) / 1000);
}
// Release every perf event fd opened by open_perf_monitoring().
void close_perf_monitoring() {
    for (int idx = 0; idx < PERF_NUM_EVENTS; idx++) {
        close(perf_selected_events[idx].fd);
    }
}
// Emit one CSV header column per monitored event, plus the elapsed-time column.
void append_perf_header() {
    int idx = 0;
    while (idx < PERF_NUM_EVENTS) {
        printf(",%s", perf_selected_events[idx].description);
        idx++;
    }
    printf(",Time (ms)");
}
// Emit the last-read counter value for each event, plus elapsed milliseconds.
void append_perf_values() {
    int idx = 0;
    while (idx < PERF_NUM_EVENTS) {
        printf(",%lu", perf_selected_events[idx].value);
        idx++;
    }
    printf(",%lu", perf_time_ms);
}
================================================
FILE: Common/timing.c
================================================
#ifdef _MSC_VER
#include
// MSVC path: millisecond timing via ftime().
// start_timing()/end_timing() share one global start/end pair, so they are
// not reentrant; the *_ts variants below take caller-owned state instead.
__declspec(selectany) struct timeb start, end;
// Begin timing using the shared global start timestamp.
void start_timing() {
ftime(&start);
}
// Stop timing; returns milliseconds elapsed since start_timing().
unsigned int end_timing() {
ftime(&end);
return 1000 * (end.time - start.time) + (end.millitm - start.millitm);
}
// Reentrant variant: caller supplies the start-timestamp storage.
void start_timing_ts(struct timeb *startTimeb) {
ftime(startTimeb);
}
// Returns milliseconds elapsed since start_timing_ts() on the same state.
unsigned int end_timing_ts(struct timeb* startTimeb) {
struct timeb end;
ftime(&end);
return 1000 * (end.time - startTimeb->time) + (end.millitm - startTimeb->millitm);
}
#else
#include
#include
// POSIX path: millisecond timing via gettimeofday().
// The global pair makes start_timing()/end_timing() non-reentrant; use the
// *_ts variants when timing regions may nest or run concurrently.
struct timeval startTv, endTv;
// Begin timing using the shared global start timestamp.
void start_timing() {
gettimeofday(&startTv, NULL);
}
// Stop timing; returns milliseconds elapsed since start_timing().
unsigned int end_timing() {
gettimeofday(&endTv, NULL);
return (unsigned int)((endTv.tv_sec - startTv.tv_sec) * 1000 + (endTv.tv_usec - startTv.tv_usec) / 1000);
}
// Reentrant variant: caller supplies the start-timestamp storage.
void start_timing_ts(struct timeval* start) {
gettimeofday(start, NULL);
}
// Returns milliseconds elapsed since start_timing_ts() on the same state.
unsigned int end_timing_ts(struct timeval* start) {
struct timeval end;
gettimeofday(&end, NULL);
return (unsigned int)((end.tv_sec - start->tv_sec) * 1000 + (end.tv_usec - start->tv_usec) / 1000);
}
#endif
// Scale an iteration count so the next run lasts approximately target_time.
// last_time and target_time share an arbitrary unit (milliseconds in this
// codebase). Always returns at least 1 iteration: the original could return
// 0 when target_time < last_time and the ratio truncated to zero, giving
// callers a zero-length (or never-terminating) next run.
unsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time) {
    unsigned long long next;
    // safety measure to deal with nasty timer precision issues if the system is fast
    if (last_time < 50) next = last_iteration_count * 2;
    else next = last_iteration_count * (target_time / last_time);
    return next > 0 ? next : 1;
}
================================================
FILE: Common/timing.h
================================================
#ifndef timingincluded
#define timingincluded
#ifdef _MSC_VER
#include
#else
#include
#endif
// Shared global timer state (used by the MSVC implementation in timing.c).
// NOTE(review): 'struct timeb' stays an incomplete type on non-MSVC builds;
// an extern declaration of an incomplete object is legal but unusable there
// — confirm it is only referenced under _MSC_VER.
extern struct timeb start, end;
// Simple global-state stopwatch: start_timing(), then end_timing() -> ms.
// NOTE(review): declared inline here but defined without inline in
// timing.c — consider dropping 'inline' to avoid C99 linkage surprises.
inline void start_timing();
inline unsigned int end_timing();
// Reentrant variants using caller-provided timestamp storage.
#ifdef _MSC_VER
void start_timing_ts(struct timeb* startTimeb);
unsigned int end_timing_ts(struct timeb* startTimeb);
#else
void start_timing_ts(struct timeval* start);
unsigned int end_timing_ts(struct timeval* start);
#endif
// Scale an iteration count so the next run lasts about target_time (ms).
unsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time);
#endif
================================================
FILE: CoreClockChecker/BoostClockChecker.c
================================================
#include
#include
#include
#include
#include
#include
#include
extern uint64_t clktsctest(uint64_t iterations) __attribute((ms_abi));
// Samples the TSC cost of a fixed dependent-add loop many times back to
// back, then calibrates TSC against wall-clock time to report per-sample
// clock speed — useful for watching boost-clock ramp behavior.
// Flags: -samples N, -iterations N, -sleep N (idle seconds before sampling).
int main(int argc, char *argv[]) {
    struct timeval startTv, endTv;
    uint64_t iterations = 500000, samples = 100;
    unsigned int sleepSeconds = 5;
    time_t time_diff_ms;
    for (int argIdx = 1; argIdx < argc; argIdx++) {
        if (*(argv[argIdx]) == '-') {
            char *arg = argv[argIdx] + 1;
            if (strncmp(arg, "samples", 7) == 0) {
                // bounds check: the original read argv[argc] (NULL) when a
                // flag was given without its value
                if (++argIdx >= argc) break;
                samples = atol(argv[argIdx]);
            } else if (strncmp(arg, "iterations", 10) == 0) {
                if (++argIdx >= argc) break;
                iterations = atol(argv[argIdx]);
            } else if (strncmp(arg, "sleep", 5) == 0) {
                if (++argIdx >= argc) break;
                sleepSeconds = atoi(argv[argIdx]);
            }
        }
    }
    // idle first so the earliest samples catch the boost ramp from idle
    sleep(sleepSeconds);
    uint64_t *measuredTscs = malloc(samples * sizeof(uint64_t));
    if (measuredTscs == NULL) {
        fprintf(stderr, "Could not allocate sample buffer\n");
        return 1;
    }
    for (uint64_t sampleIdx = 0; sampleIdx < samples; sampleIdx++) {
        measuredTscs[sampleIdx] = clktsctest(iterations);
    }
    fprintf(stderr, "Used %lu samples\n", samples);
    fprintf(stderr, "Used %lu iterations\n", iterations);
    // figure out TSC to real time ratio
    fprintf(stderr, "Checking TSC ratio...\n");
    uint64_t iterationsHi = 8e9; // should be a couple seconds at least?
    gettimeofday(&startTv, NULL);
    uint64_t referenceElapsedTsc = clktsctest(iterationsHi);
    gettimeofday(&endTv, NULL);
    time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
    float tsc_per_ms = (float)referenceElapsedTsc / (float)time_diff_ms;
    float tsc_per_ns = tsc_per_ms / 1e6;
    fprintf(stderr, "TSC = %lu, elapsed ms = %lu\n", referenceElapsedTsc, time_diff_ms);
    fprintf(stderr, "TSC per ms: %f, TSC per ns: %f\n", tsc_per_ms, tsc_per_ns);
    printf("Time (ms), Clk (GHz), TSC\n");
    float elapsedTime = 0;
    for (uint64_t sampleIdx = 0; sampleIdx < samples; sampleIdx++) {
        // elapsed TSC / (TSC per ms) = elapsed ms for this sample
        float elapsedTimeMs = measuredTscs[sampleIdx] / tsc_per_ms;
        elapsedTime += elapsedTimeMs;
        // the test loop is a dependent-add chain, presumably retiring one
        // add per cycle, so adds/ns approximates core clock in GHz
        float latency = 1e6 * elapsedTimeMs / (float)iterations;
        float addsPerNs = 1 / latency;
        printf("%f,%f,%lu\n", elapsedTime, addsPerNs, measuredTscs[sampleIdx]);
    }
    free(measuredTscs); // was leaked in the original
    return 0;
}
================================================
FILE: CoreClockChecker/BoostClockChecker_arm.s
================================================
.text
.global clktsctest
.global _clktsctest
.balign 4
/* x0 = iterations, return elapsed TSC in x0 */
/* Executes a serially-dependent add chain (20 adds per loop pass, so x0
   must be a multiple of 20) bracketed by cntvct_el0 reads; returns the
   elapsed virtual counter ticks.
   NOTE(review): x10-x15 are caller-saved under AAPCS64, so the stack
   save/restore below is redundant but harmless. */
_clktsctest:
clktsctest:
sub sp, sp, #0x40
stp x10, x11, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x14, x15, [sp, #0x30]
mov x10, 1
mov x11, 20
mov x12, 0
/* stackoverflow says this is a good idea */
mrs x14, cntvct_el0
clktsctest_loop:
/* each add depends on the previous one, giving a fixed 1-add-per-cycle
   dependency chain regardless of core width */
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
sub x0, x0, x11
cbnz x0, clktsctest_loop
mrs x15, cntvct_el0
/* return end counter minus start counter */
sub x0, x15, x14
ldp x14, x15, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x10, x11, [sp, #0x10]
add sp, sp, #0x40
ret
================================================
FILE: CoreClockChecker/BoostClockChecker_x86.s
================================================
.global clktsctest
/* Microsoft x64 calling convention (the C prototype is declared ms_abi):
   rcx = iterations (must be a multiple of 20), returns elapsed TSC in rax.
   Runs a serially-dependent add chain bracketed by rdtsc reads.
   Fix: rdi is CALLEE-SAVED under ms_abi, but the original used it as the
   loop counter without saving it, corrupting the caller's rdi. */
clktsctest:
push %rdx
push %rbx
push %rdi /* ms_abi nonvolatile; used below as the loop counter */
push %r8
push %r9
push %r10
mov %rcx, %rdi
mov $1, %r8 /* increment for the dependency chain */
mov $20, %r9 /* adds per loop pass */
xor %rbx, %rbx
rdtsc /* high 32 bits in EDX, low 32 bits in EAX */
shl $32, %rdx /* shift high 32 bits into upper half of RDX */
add %rax, %rdx /* place full 64-bit value in rdx */
mov %rdx, %r10 /* stash start TSC */
clktsctest_loop:
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
sub %r9, %rdi
jnz clktsctest_loop
rdtsc
shl $32, %rdx
add %rdx, %rax /* now rax has the new value */
sub %r10, %rax /* subtract old TSC value from the new one, which should be larger */
pop %r10
pop %r9
pop %r8
pop %rdi
pop %rbx
pop %rdx
ret
================================================
FILE: CoreClockChecker/CoreClockChecker.c
================================================
#define _GNU_SOURCE
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#define MSR_RAPL_PWR_UNIT 0xC0010299
#define HWCR 0xC0010015
#define MSR_CORE_ENERGY_STAT 0xC001029A
#define MSR_PKG_ENERGY_STAT 0xC001029B
#define INTEL_MSR_RAPL_PWR_UNIT 0x606
#define INTEL_MSR_PP0_ENERGY_STATUS 0x639
#define INTEL_MSR_PKG_ENERGY_STATUS 0x611
extern uint64_t clktest(uint64_t iterations) __attribute((sysv_abi));
void detectCpuMaker();
void setBoost(int on);
void setAffinity(int core);
int openMsr(int core);
uint64_t readMsr(int fd, uint32_t addr);
void writeMsr(int fd, uint32_t addr, uint64_t value);
float getEnergyStatusUnits();
uint64_t getCoreEnergyStat(int core);
uint64_t getPkgEnergyStat(int core);
uint64_t getTotalCoreEnergy();
int *msrFds;
int amdCpu = 1;
int numProcs = 0;
// Modes (argv[1]): disableboost / enableboost (toggle AMD CPB via HWCR),
// "power" (per-core + package power while spinning the add loop),
// "measurecmd <cmd>" (RAPL energy around an arbitrary command), or no args
// (per-core clock speed via the dependent-add loop).
int main(int argc, char *argv[]) {
    struct timeval startTv, endTv;
    time_t time_diff_ms;
    float latency, clockSpeedGhz, energyUnits;
    uint64_t startEnergy, endEnergy, startPkgEnergy, endPkgEnergy;
    uint64_t iterationsHigh = 8e9;
    detectCpuMaker();
    numProcs = get_nprocs();
    fprintf(stderr, "Number of CPUs: %u\n", numProcs);
    // MSR fds are opened lazily by the helpers; zero = not yet opened
    msrFds = (int *)malloc(sizeof(int) * numProcs);
    memset(msrFds, 0, sizeof(int) * numProcs);
    if (argc > 1 && strncmp(argv[1], "disableboost", 12) == 0) {
        setBoost(0);
    } else if (argc > 1 && strncmp(argv[1], "enableboost", 11) == 0) {
        setBoost(1);
    } else if (argc > 1 && strncmp(argv[1], "power", 5) == 0) {
        iterationsHigh *= 2; // try for more accuracy
        energyUnits = getEnergyStatusUnits();
        printf("Core, Core Power, Package Power\n");
        for (int i = 0; i < numProcs; i++) {
            setAffinity(i);
            gettimeofday(&startTv, NULL);
            startEnergy = getCoreEnergyStat(i);
            startPkgEnergy = getPkgEnergyStat(i);
            clktest(iterationsHigh);
            endPkgEnergy = getPkgEnergyStat(i);
            endEnergy = getCoreEnergyStat(i);
            gettimeofday(&endTv, NULL);
            time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
            // fix: the original divided by (time_diff_ms / 1000) in integer
            // arithmetic, truncating partial seconds and overstating power
            float seconds = (float)time_diff_ms / 1000.0f;
            printf("%d, %f, %f\n", i,
                ((endEnergy - startEnergy) * energyUnits) / seconds,
                ((endPkgEnergy - startPkgEnergy) * energyUnits) / seconds);
        }
    } else if (argc > 2 && strncmp(argv[1], "measurecmd", 10) == 0) {
        // fix: length was 9, which also matched e.g. "measurecmX"
        int rc;
        float coreJoules, pkgJoules;
        fprintf(stderr, "argv[2] is %s\nOnly handling Intel at the moment\n", argv[2]);
        energyUnits = getEnergyStatusUnits();
        gettimeofday(&startTv, NULL);
        startEnergy = getTotalCoreEnergy();
        startPkgEnergy = getPkgEnergyStat(0);
        rc = system(argv[2]);
        endEnergy = getTotalCoreEnergy();
        endPkgEnergy = getPkgEnergyStat(0);
        gettimeofday(&endTv, NULL);
        fprintf(stderr, "system() returned %d\n", rc);
        time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
        coreJoules = (endEnergy - startEnergy) * energyUnits;
        pkgJoules = (endPkgEnergy - startPkgEnergy) * energyUnits;
        printf("Core Joules: %f\n", coreJoules);
        printf("Package Joules: %f\n", pkgJoules);
        printf("Elapsed time, seconds: %f\n", (double)time_diff_ms / 1000);
    }
    else {
        for (int i = 0; i < numProcs; i++) {
            setAffinity(i);
            gettimeofday(&startTv, NULL);
            clktest(iterationsHigh);
            gettimeofday(&endTv, NULL);
            time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
            // ns per dependent add; its reciprocal approximates GHz
            latency = 1e6 * (float)time_diff_ms / (float)iterationsHigh;
            clockSpeedGhz = 1 / latency;
            printf("%d, %f GHz\n", i, clockSpeedGhz);
        }
    }
    // close any MSR fds the helpers opened on demand (were leaked before)
    for (int i = 0; i < numProcs; i++) {
        if (msrFds[i] > 0) close(msrFds[i]);
    }
    free(msrFds);
    return 0;
}
// Identify the CPU vendor from CPUID leaf 0 and set the global amdCpu flag
// (1 for AuthenticAMD, 0 otherwise, including GenuineIntel).
void detectCpuMaker() {
    uint32_t eax, ebx, ecx, edx;
    char vendor[13];
    amdCpu = 0;
    __cpuid_count(0, 0, eax, ebx, ecx, edx);
    // the 12-byte vendor string is returned in ebx:edx:ecx order
    memcpy(vendor, &ebx, 4);
    memcpy(vendor + 4, &edx, 4);
    memcpy(vendor + 8, &ecx, 4);
    vendor[12] = 0;
    fprintf(stderr, "CPU name: %s\n", vendor);
    if (memcmp(vendor, "GenuineIntel", 12) == 0) {
        amdCpu = 0;
        fprintf(stderr, "Looks like Intel\n");
    } else if (memcmp(vendor, "AuthenticAMD", 12) == 0) {
        amdCpu = 1;
        fprintf(stderr, "Looks like AMD\n");
    }
}
// Pin the calling thread to the given logical core; logs on failure.
void setAffinity(int core) {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core, &cpuset);
    if (pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset) != 0) {
        fprintf(stderr, "unable to set thread affinity to %d\n", core);
    }
}
// Open the msr device file (/dev/cpu/<core>/msr) for read/write access.
// Returns the fd, or -1 on failure (requires root + the msr kernel module).
int openMsr(int core) {
    char msrFilename[255];
    int fd;
    // snprintf replaces the original unbounded sprintf
    snprintf(msrFilename, sizeof(msrFilename), "/dev/cpu/%d/msr", core);
    fd = open(msrFilename, O_RDWR);
    if (fd < 0) {
        fprintf(stderr, "Could not open MSR file, core %d\n", core);
        return -1;
    }
    return fd;
}
// Read a 64-bit MSR at the given address (MSR index = file offset).
// Returns 0 on failure; the original returned an uninitialized local when
// pread failed or came up short.
uint64_t readMsr(int fd, uint32_t addr) {
    uint64_t result = 0;
    ssize_t bytesRead = pread(fd, &result, sizeof(result), addr);
    if (bytesRead != (ssize_t)sizeof(result)) {
        fprintf(stderr, "Could not read from fd %d, msr %u\n", fd, addr);
        return 0;
    }
    return result;
}
// Write a 64-bit MSR, then read it back and warn if the value did not take
// effect (some MSR bits are read-only or locked down by firmware).
void writeMsr(int fd, uint32_t addr, uint64_t value) {
    uint64_t newValue;
    // pwrite returns ssize_t; the original stored it in uint64_t, which
    // still compared correctly but obscured the -1 error case
    ssize_t bytesWritten = pwrite(fd, &value, sizeof(value), addr);
    if (bytesWritten != (ssize_t)sizeof(value)) {
        fprintf(stderr, "Could not write to fd %d, msr %u, value %lu\n", fd, addr, value);
    }
    newValue = readMsr(fd, addr);
    if (value != newValue) {
        fprintf(stderr, "Wrote to fd %d, msr %u, value %lu, but write did not take effect\n", fd, addr, value);
    }
}
// Toggle AMD Core Performance Boost (CPB) on every logical CPU by flipping
// bit 25 of the HWCR MSR (bit set = boost disabled). AMD-specific: HWCR is
// an AMD MSR; this is not guarded by the amdCpu flag, so only call it on AMD.
void setBoost(int on) {
uint64_t hwcrValue;
for (int i = 0; i < numProcs; i++) {
setAffinity(i);
// open the per-core MSR fd lazily and cache it in msrFds
if (!msrFds[i]) msrFds[i] = openMsr(i);
hwcrValue = readMsr(msrFds[i], HWCR);
if (on) {
hwcrValue &= ~(1UL << 25); // unset bit to request CPB on
//fprintf(stderr, "Requesting CPB on (unsetting bit 25 in HWCR): 0x%08x\n", hwcrValue);
} else {
hwcrValue |= (1UL << 25); // set bit to disable CPB
//fprintf(stderr, "Requesting CPB off (setting bit 25 in HWCR): 0x%08x\n", hwcrValue);
}
// writeMsr reads the value back and warns if the change did not stick
writeMsr(msrFds[i], HWCR, hwcrValue);
}
}
// Read the RAPL power-unit MSR (vendor-specific address) and convert the
// energy status unit field into joules per counter tick: 0.5^ESU.
float getEnergyStatusUnits() {
    setAffinity(0);
    if (!msrFds[0]) msrFds[0] = openMsr(0);
    uint32_t unitMsr = amdCpu ? MSR_RAPL_PWR_UNIT : INTEL_MSR_RAPL_PWR_UNIT;
    uint64_t raplPwrUnit = readMsr(msrFds[0], unitMsr);
    // bits [12:8] hold the energy status unit exponent
    uint64_t energyUnits = (raplPwrUnit >> 8) & 0x1F;
    return (float)pow(0.5, (double)energyUnits);
}
// Read the accumulated core energy counter: per-core on AMD,
// PP0 (all-cores power plane) on Intel.
uint64_t getCoreEnergyStat(int core) {
    if (!msrFds[core]) msrFds[core] = openMsr(core);
    uint32_t addr = amdCpu ? MSR_CORE_ENERGY_STAT : INTEL_MSR_PP0_ENERGY_STATUS;
    return readMsr(msrFds[core], addr);
}
// Read the accumulated package-level energy counter via the given core's
// MSR fd (vendor-specific MSR address).
uint64_t getPkgEnergyStat(int core) {
    if (!msrFds[core]) msrFds[core] = openMsr(core);
    uint32_t addr = amdCpu ? MSR_PKG_ENERGY_STAT : INTEL_MSR_PKG_ENERGY_STATUS;
    return readMsr(msrFds[core], addr);
}
// Sum the per-core energy counters across all physical cores (AMD), or
// return the single PP0 counter (Intel, which does not track per-core).
uint64_t getTotalCoreEnergy() {
if (amdCpu) {
uint64_t totalCoreEnergy = 0;
// only testing the 5950X and 3950X for now, and physical cores
// are 0-15 on linux. hack around this until I have time to
// programatically figure out SMT siblings
// NOTE(review): hard-coded 16 under-/over-counts on any other AMD part.
for (int i = 0; i < 16; i++) {
totalCoreEnergy += getCoreEnergyStat(i);
}
return totalCoreEnergy;
} else {
// intel does not track power per core
return getCoreEnergyStat(0);
}
}
================================================
FILE: CoreClockChecker/CoreClockChecker_x86.s
================================================
.global clktest
/*
%rdi = arg0 = iteration count
*/
/* SysV AMD64: runs rdi iterations of a serially-dependent add chain
   (20 adds per loop pass, so rdi must be a multiple of 20). The dependent
   chain pins throughput to one add per cycle, so iterations / wall time
   approximates core clock. No return value is produced; the caller times it.
   NOTE(review): r8/r9 are caller-saved under SysV — the push/pop pairs are
   redundant but harmless. rbx is callee-saved and correctly preserved. */
clktest:
push %rbx
push %r8
push %r9
mov $1, %r8
mov $20, %r9
xor %rbx, %rbx
clktest_loop:
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
sub %r9, %rdi
jnz clktest_loop
pop %r9
pop %r8
pop %rbx
ret
================================================
FILE: CoreClockChecker/Makefile
================================================
# Build CoreClockChecker / BoostClockChecker for the detected (or given)
# target; arch_detect.mk supplies TARGET and the per-target CC.
include ../Common/arch_detect.mk
CFLAGS = -O3
LDFLAGS = -lm
all: $(TARGET)
# CoreClockChecker needs MSR/affinity support and is x86-Linux only
amd64:
	$(CC) $(CFLAGS) -pthread CoreClockChecker.c CoreClockChecker_x86.s -o CoreClockChecker_amd64 $(LDFLAGS)
	$(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_x86.s -o BoostClockChecker_amd64 $(LDFLAGS)
aarch64:
	$(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_arm.s -o BoostClockChecker_aarch64 $(LDFLAGS)
w64:
	$(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_x86.s -o BoostClockChecker_w64.exe $(LDFLAGS)
ci: amd64 aarch64 w64
clean:
	rm -f *.o && find . -type f -executable -delete
.PHONY: all ci clean
================================================
FILE: CoreClockChecker/WinCoreClockChecker/CoreClockCheckFunctions.asm
================================================
section .text
bits 64
global clktest
; rcx = iteration count
; rdx = address of memory location to monitor
; return elapsed tsc
; NOTE(review): despite the header comment above, this returns the add
; counter (rbx), not an elapsed TSC; the caller divides it by wall time to
; get GHz. Runs a dependent add chain (20 adds per pass) and also polls the
; shared flag: the first thread to exhaust its budget sets *rdx, making all
; other threads stop on their next pass so their run times stay aligned.
clktest:
push rdx
push rbx
push r8
push r9
push r10
push r11
xor rbx, rbx
mov r8, 1 ; GLC will eliminate adds with immediates or increments
clktest_loop:
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
mov r11d, [rdx]
test r11d, r11d
jnz clktest_loop_end ; early exit condition (someone else exited)
sub rcx, 20
jg clktest_loop
; budget exhausted first: raise the shared stop flag for the other threads
mov [rdx], r8
clktest_loop_end:
mov rax, rbx ; return completed add count
pop r11
pop r10
pop r9
pop r8
pop rbx
pop rdx
ret
================================================
FILE: CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.cpp
================================================
// WinCoreClockChecker.cpp : This file contains the 'main' function. Program execution begins and ends there.
//
#include
#include
#include
#include
#include
extern "C" uint64_t clktest(uint64_t iterations, uint64_t *flag);
// Hard-coded logical-core orderings for one specific hybrid CPU:
// E-cores at logical 2..9, P-cores on even logical indices (12,10,14,16,18,0),
// and 20/21 used as "LPE" cores (see the test labels in main).
// NOTE(review): machine-specific; re-derive these tables for other topologies.
int ECoreTestOrder[] = { 2, 3, 4, 5, 6, 7, 8, 9 };
int BackwardECoreTestOrder[] = { 9, 8, 7, 6, 5, 4, 3, 2 };
int AlternatingECoreTestOrder[] = { 2, 6, 3, 7, 4, 8, 5, 9 };
int PCoreTestOrder[] = { 12, 10, 14, 16, 18, 0 };
int AllECores[] = { 20, 21, 2, 3, 4, 5, 6, 7, 8, 9 };
int AllCores[] = { 12, 10, 14, 16, 18, 0, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21 };
// Per-thread parameters and results for ClockTestThread.
struct ClockTestData {
uint64_t iterations; // iteration budget handed to clktest
uint64_t completed_iterations; // adds actually retired before the stop flag
uint64_t *flag; // shared early-exit flag (first finisher sets it)
};
float* runMtClockTest(int* cores, int nCores);
void PrintResults(int* cores, float* results, int coreCount);
void RunCoreByCoreClockTest(int* cores, int coreCount);
void RunEvenCoreTest(int coreCount);
// per-thread iteration budget; raised for P-core runs in main
uint64_t start_iterations = 8e9;
// Runs a sequence of multi-threaded clock-scaling tests over fixed core
// orderings; an optional argv[1] additionally tests that many even-indexed
// logical cores first.
int main(int argc, char *argv[])
{
    // Test E-Cores one by one
    start_iterations = 8e9;
    if (argc > 1)
    {
        int evenCoreCount = atoi(argv[1]);
        // fix: the original printf had "%d" but no argument (undefined behavior)
        printf("Even Cores, core count %d\n", evenCoreCount);
        RunEvenCoreTest(evenCoreCount);
    }
    // first E-core pass doubles as warmup; results come from the repeat runs
    printf("E-Cores, Warmup:\n");
    RunCoreByCoreClockTest(ECoreTestOrder, sizeof(ECoreTestOrder) / sizeof(int));
    printf("E-Cores, filling one cluster first:\n");
    RunCoreByCoreClockTest(ECoreTestOrder, sizeof(ECoreTestOrder) / sizeof(int));
    printf("E-Cores, filling other cluster first but still one cluster at a time:\n");
    RunCoreByCoreClockTest(BackwardECoreTestOrder, sizeof(BackwardECoreTestOrder) / sizeof(int));
    printf("E-Cores, alternating cores between clusters:\n");
    RunCoreByCoreClockTest(AlternatingECoreTestOrder, sizeof(AlternatingECoreTestOrder) / sizeof(int));
    printf("E-Cores, LPE first:\n");
    RunCoreByCoreClockTest(AllECores, sizeof(AllECores) / sizeof(int));
    // P-cores run faster, so give them a larger iteration budget
    start_iterations = 12e9;
    printf("P-Cores, warmup:\n");
    RunCoreByCoreClockTest(PCoreTestOrder, sizeof(PCoreTestOrder) / sizeof(int));
    printf("P-Cores, fastest core first:\n");
    RunCoreByCoreClockTest(PCoreTestOrder, sizeof(PCoreTestOrder) / sizeof(int));
    printf("All cores, fastest core first:\n");
    RunCoreByCoreClockTest(AllCores, sizeof(AllCores) / sizeof(int));
    return 0;
}
// Run the scaling clock test across the first coreCount even-numbered
// logical cores (0, 2, 4, ...).
void RunEvenCoreTest(int coreCount)
{
    int* coreSequence = (int*)malloc(sizeof(int) * coreCount);
    for (int idx = 0; idx < coreCount; idx++)
        coreSequence[idx] = 2 * idx;
    RunCoreByCoreClockTest(coreSequence, coreCount);
    free(coreSequence);
}
// For n = 1..coreCount, run the multi-threaded clock test on the first n
// cores of the given order, collecting per-core GHz results into a
// (triangular) coreCount x coreCount matrix, then print it.
void RunCoreByCoreClockTest(int *cores, int coreCount)
{
    // calloc = malloc + zero fill; untested cells stay 0 and print as "-"
    float* matrix = (float*)calloc((size_t)coreCount * coreCount, sizeof(float));
    for (int n = 1; n <= coreCount; n++)
    {
        float* row = runMtClockTest(cores, n);
        for (int j = 0; j < n; j++)
            matrix[coreCount * (n - 1) + j] = row[j];
        free(row);
    }
    PrintResults(cores, matrix, coreCount);
    free(matrix);
}
// Print the result matrix as CSV: a header row of core IDs, then one row
// per active-core count, with "-" for cells that were not tested.
void PrintResults(int *cores, float* results, int coreCount)
{
    for (int c = 0; c < coreCount; c++)
        printf(",%d", cores[c]);
    printf("\n");
    for (int row = 0; row < coreCount; row++)
    {
        printf("%d", row + 1); // active core count for this row
        for (int col = 0; col < coreCount; col++)
        {
            float value = results[coreCount * row + col];
            if (value != 0.0f) printf(",%f", value);
            else printf(",-");
        }
        printf("\n");
    }
}
// Thread entry: runs the assembly add-chain loop and records how much work
// it completed. clktest returns its add counter, which grows one per add
// until the iteration budget runs out or the shared stop flag is raised.
DWORD WINAPI ClockTestThread(LPVOID param)
{
struct ClockTestData* testData = (struct ClockTestData*)param;
testData->completed_iterations = clktest(testData->iterations, testData->flag);
return 0;
}
// cores = array of test order -> logical core id
// cores = array of test order -> logical core id
// Spawn one pinned thread per core (created suspended so all threads start
// together), time the whole run, and convert each thread's completed work
// into GHz — the dependent-add loop presumably retires ~1 add per cycle,
// so adds/second ~ clock. Caller frees the returned array.
float* runMtClockTest(int* cores, int nCores)
{
struct timeb start, end;
struct ClockTestData* threadData = (struct ClockTestData*)malloc(sizeof(struct ClockTestData) * nCores);
float* results = (float*)malloc(sizeof(float) * nCores);
memset(results, 0, sizeof(float) * nCores);
HANDLE* testThreads = (HANDLE*)malloc(sizeof(HANDLE) * nCores);
// try to align test times
float maxThreadTsc, minThreadTsc; // NOTE(review): unused locals
float time_diff_sec;
// shared early-exit flag: the first thread to finish sets it, stopping the
// others so all threads run for roughly the same wall time
uint64_t flag = 0;
for (int i = 0; i < nCores; i++)
{
threadData[i].iterations = start_iterations;
threadData[i].flag = &flag;
// NOTE(review): CreateThread's return is not checked (a NULL handle would
// fault in ResumeThread), thread handles are never CloseHandle'd, and
// WaitForMultipleObjects caps at 64 handles — confirm acceptable here.
testThreads[i] = CreateThread(NULL, 0, ClockTestThread, threadData + i, CREATE_SUSPENDED, NULL);
SetThreadAffinityMask(testThreads[i], 1ULL << (uint64_t)cores[i]);
}
ftime(&start);
for (int i = 0; i < nCores; i++)
{
ResumeThread(testThreads[i]);
}
WaitForMultipleObjects(nCores, testThreads, TRUE, INFINITE);
ftime(&end);
time_diff_sec = (float)(end.time - start.time) + 0.001f * (end.millitm - start.millitm);
for (int i = 0; i < nCores; i++)
{
// fprintf(stderr, "Core %d: %llu iterations in %f sec\n", cores[i], threadData[i].completed_iterations, time_diff_sec);
float ghz = ((float)threadData[i].completed_iterations / 1e9) / time_diff_sec;
// fprintf(stderr, "Core %d: %f GHz\n", cores[i], ghz);
results[i] = ghz;
}
free(testThreads);
free(threadData);
return results;
}
================================================
FILE: CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.sln
================================================
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.9.34723.18
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WinCoreClockChecker", "WinCoreClockChecker.vcxproj", "{D70EC1DD-794C-4156-8483-227E566CC76B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|x64 = Release|x64
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x64.ActiveCfg = Debug|x64
{D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x64.Build.0 = Debug|x64
{D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x86.ActiveCfg = Debug|Win32
{D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x86.Build.0 = Debug|Win32
{D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x64.ActiveCfg = Release|x64
{D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x64.Build.0 = Release|x64
{D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x86.ActiveCfg = Release|Win32
{D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x86.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {6AA7051E-EAEF-48CA-9C08-8641D57B3EB1}
EndGlobalSection
EndGlobal
================================================
FILE: CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.vcxproj
================================================
Debug
Win32
Release
Win32
Debug
x64
Release
x64
17.0
Win32Proj
{d70ec1dd-794c-4156-8483-227e566cc76b}
WinCoreClockChecker
10.0
Application
true
v143
Unicode
Application
false
v143
true
Unicode
Application
true
v143
Unicode
Application
false
v143
true
Unicode
Level3
true
WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
Level3
true
_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
Document
nasm -f win64 CoreClockCheckFunctions.asm
CoreClockCheckFunctions.obj
nasm -f win64 CoreClockCheckFunctions.asm
CoreClockCheckFunctions.obj
nasm -f win64 CoreClockCheckFunctions.asm
CoreClockCheckFunctions.obj
nasm -f win64 CoreClockCheckFunctions.asm
CoreClockCheckFunctions.obj
================================================
FILE: CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.vcxproj.filters
================================================
{4FC737F1-C7A5-4376-A066-2A32D752A2FF}
cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx
{93995380-89BD-4b04-88EB-625FBE52EBFB}
h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd
{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
Source Files
Source Files
================================================
FILE: GpuMemLatency/Makefile
================================================
# Build GpuMemLatency: OpenCL memory latency / bandwidth / atomics benchmarks.
# arch_detect.mk sets TARGET to the detected platform name, which "all" uses
# to dispatch to the matching arch-specific rule below.
include ../Common/arch_detect.mk
# OpenCL SDK/ICD version fetched by the CI script
OCL_VER = v2023.04.17
CI_SCRIPT = ../Common/ci_gpumemlatency.sh
CFLAGS = -O3 -I ../Common
DEPS = ../Common/timings.h
OBJ = opencltest.o latency_test.o bw_test.o common.o atomic_test.o instruction_rate.o timing.o
# ?= keeps a user-supplied LDFLAGS; macOS links OpenCL as a framework
LDFLAGS ?= -lm -lOpenCL
ifeq ($(TARGET), Darwin)
LDFLAGS = -lm -framework OpenCL
endif
all: $(TARGET)
# plain binary name (no arch suffix)
GpuMemLatency: $(OBJ)
$(CC) $(CPPFLAGS) $(CFLAGS) $^ -o $@ $(LDFLAGS)
# generic compile rule; rebuild when shared timing header changes
%.o: %.c $(DEPS)
$(CC) $(CFLAGS) -c -o $@ $<
# timing.c lives in ../Common, outside the pattern rule's reach
timing.o:
$(CC) $(CFLAGS) -c ../Common/timing.c -o timing.o
# per-arch targets differ only in the output binary's name suffix
amd64: $(OBJ)
$(CC) $(CFLAGS) $^ -o GpuMemLatency_amd64 $(LDFLAGS)
aarch64: $(OBJ)
$(CC) $(CFLAGS) $^ -o GpuMemLatency_aarch64 $(LDFLAGS)
riscv64: $(OBJ)
$(CC) $(CFLAGS) $^ -o GpuMemLatency_riscv64 $(LDFLAGS)
w64: $(OBJ)
$(CC) $(CFLAGS) $^ -o GpuMemLatency_w64.exe $(LDFLAGS)
darwin: $(OBJ)
$(CC) $(CFLAGS) $^ -o GpuMemLatency_darwin $(LDFLAGS)
# CI entry point: clean build driven by the shared shell script
ci: clean
@OCL_VER=$(OCL_VER) sh $(CI_SCRIPT)
clean-ci:
rm -rf "*.deb" "*.zip" "ocl-icd-*" "OpenCL-SDK-*"
clean-obj:
rm -f *.o
clean: clean-ci clean-obj
find . -type f -executable -delete
.PHONY: all ci clean-ci clean-obj clean
================================================
FILE: GpuMemLatency/OpenCL/LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: GpuMemLatency/OpenCL/README.md
================================================
# OpenCL&trade; API Headers
This repository contains C language headers for the OpenCL API.
The authoritative public repository for these headers is located at:
https://github.com/KhronosGroup/OpenCL-Headers
Issues, proposed fixes for issues, and other suggested changes should be
created using Github.
## Branch Structure
The OpenCL API headers in this repository are Unified headers and are designed
to work with all released OpenCL versions. This differs from previous OpenCL
API headers, where version-specific API headers either existed in separate
branches, or in separate folders in a branch.
## Compiling for a Specific OpenCL Version
By default, the OpenCL API headers in this repository are for the latest
OpenCL version (currently OpenCL 2.2). To use these API headers to target
a different OpenCL version, an application may `#define` the preprocessor
value `CL_TARGET_OPENCL_VERSION` before including the OpenCL API headers.
The `CL_TARGET_OPENCL_VERSION` is a three digit decimal value representing
the OpenCL API version.
For example, to enforce usage of no more than the OpenCL 1.2 APIs, you may
include the OpenCL API headers as follows:
```
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/opencl.h>
```
## Directory Structure
```
README.md This file
LICENSE Source license for the OpenCL API headers
CL/ Unified OpenCL API headers tree
```
## License
See [LICENSE](LICENSE).
---
OpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by Khronos.
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_H
#define __OPENCL_CL_H
#include <CL/cl_version.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************/
typedef struct _cl_platform_id * cl_platform_id;
typedef struct _cl_device_id * cl_device_id;
typedef struct _cl_context * cl_context;
typedef struct _cl_command_queue * cl_command_queue;
typedef struct _cl_mem * cl_mem;
typedef struct _cl_program * cl_program;
typedef struct _cl_kernel * cl_kernel;
typedef struct _cl_event * cl_event;
typedef struct _cl_sampler * cl_sampler;
typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
typedef cl_ulong cl_bitfield;
typedef cl_bitfield cl_device_type;
typedef cl_uint cl_platform_info;
typedef cl_uint cl_device_info;
typedef cl_bitfield cl_device_fp_config;
typedef cl_uint cl_device_mem_cache_type;
typedef cl_uint cl_device_local_mem_type;
typedef cl_bitfield cl_device_exec_capabilities;
#ifdef CL_VERSION_2_0
typedef cl_bitfield cl_device_svm_capabilities;
#endif
typedef cl_bitfield cl_command_queue_properties;
#ifdef CL_VERSION_1_2
typedef intptr_t cl_device_partition_property;
typedef cl_bitfield cl_device_affinity_domain;
#endif
typedef intptr_t cl_context_properties;
typedef cl_uint cl_context_info;
#ifdef CL_VERSION_2_0
typedef cl_bitfield cl_queue_properties;
#endif
typedef cl_uint cl_command_queue_info;
typedef cl_uint cl_channel_order;
typedef cl_uint cl_channel_type;
typedef cl_bitfield cl_mem_flags;
#ifdef CL_VERSION_2_0
typedef cl_bitfield cl_svm_mem_flags;
#endif
typedef cl_uint cl_mem_object_type;
typedef cl_uint cl_mem_info;
#ifdef CL_VERSION_1_2
typedef cl_bitfield cl_mem_migration_flags;
#endif
typedef cl_uint cl_image_info;
#ifdef CL_VERSION_1_1
typedef cl_uint cl_buffer_create_type;
#endif
typedef cl_uint cl_addressing_mode;
typedef cl_uint cl_filter_mode;
typedef cl_uint cl_sampler_info;
typedef cl_bitfield cl_map_flags;
#ifdef CL_VERSION_2_0
typedef intptr_t cl_pipe_properties;
typedef cl_uint cl_pipe_info;
#endif
typedef cl_uint cl_program_info;
typedef cl_uint cl_program_build_info;
#ifdef CL_VERSION_1_2
typedef cl_uint cl_program_binary_type;
#endif
typedef cl_int cl_build_status;
typedef cl_uint cl_kernel_info;
#ifdef CL_VERSION_1_2
typedef cl_uint cl_kernel_arg_info;
typedef cl_uint cl_kernel_arg_address_qualifier;
typedef cl_uint cl_kernel_arg_access_qualifier;
typedef cl_bitfield cl_kernel_arg_type_qualifier;
#endif
typedef cl_uint cl_kernel_work_group_info;
#ifdef CL_VERSION_2_1
typedef cl_uint cl_kernel_sub_group_info;
#endif
typedef cl_uint cl_event_info;
typedef cl_uint cl_command_type;
typedef cl_uint cl_profiling_info;
#ifdef CL_VERSION_2_0
typedef cl_bitfield cl_sampler_properties;
typedef cl_uint cl_kernel_exec_info;
#endif
#ifdef CL_VERSION_3_0
typedef cl_bitfield cl_device_atomic_capabilities;
typedef cl_uint cl_khronos_vendor_id;
typedef cl_bitfield cl_mem_properties;
typedef cl_uint cl_version;
#endif
/* Image format descriptor: channel ordering plus per-channel data type. */
typedef struct _cl_image_format {
cl_channel_order image_channel_order;
cl_channel_type image_channel_data_type;
} cl_image_format;
#ifdef CL_VERSION_1_2
/* Image descriptor (OpenCL 1.2+): image type plus dimensions and pitches.
* In OpenCL 2.0+ the buffer field becomes an anonymous union with
* mem_object, hence the compiler-specific warning suppression below. */
typedef struct _cl_image_desc {
cl_mem_object_type image_type;
size_t image_width;
size_t image_height;
size_t image_depth;
size_t image_array_size;
size_t image_row_pitch;
size_t image_slice_pitch;
cl_uint num_mip_levels;
cl_uint num_samples;
#ifdef CL_VERSION_2_0
#ifdef __GNUC__
__extension__ /* Prevents warnings about anonymous union in -pedantic builds */
#endif
#ifdef _MSC_VER
#pragma warning( push )
#pragma warning( disable : 4201 ) /* Prevents warning about nameless struct/union in /W4 /Za builds */
#endif
union {
#endif
cl_mem buffer;
#ifdef CL_VERSION_2_0
cl_mem mem_object;
};
#ifdef _MSC_VER
#pragma warning( pop )
#endif
#endif
} cl_image_desc;
#endif
#ifdef CL_VERSION_1_1
/* Sub-buffer region (origin/size within a parent buffer); units are bytes
* per the OpenCL clCreateSubBuffer specification. */
typedef struct _cl_buffer_region {
size_t origin;
size_t size;
} cl_buffer_region;
#endif
#ifdef CL_VERSION_3_0
#define CL_NAME_VERSION_MAX_NAME_SIZE 64
/* OpenCL 3.0 name/version pair, fixed-size name buffer. */
typedef struct _cl_name_version {
cl_version version;
char name[CL_NAME_VERSION_MAX_NAME_SIZE];
} cl_name_version;
#endif
/******************************************************************************/
/* Error Codes */
#define CL_SUCCESS 0
#define CL_DEVICE_NOT_FOUND -1
#define CL_DEVICE_NOT_AVAILABLE -2
#define CL_COMPILER_NOT_AVAILABLE -3
#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
#define CL_OUT_OF_RESOURCES -5
#define CL_OUT_OF_HOST_MEMORY -6
#define CL_PROFILING_INFO_NOT_AVAILABLE -7
#define CL_MEM_COPY_OVERLAP -8
#define CL_IMAGE_FORMAT_MISMATCH -9
#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
#define CL_BUILD_PROGRAM_FAILURE -11
#define CL_MAP_FAILURE -12
#ifdef CL_VERSION_1_1
#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13
#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
#endif
#ifdef CL_VERSION_1_2
#define CL_COMPILE_PROGRAM_FAILURE -15
#define CL_LINKER_NOT_AVAILABLE -16
#define CL_LINK_PROGRAM_FAILURE -17
#define CL_DEVICE_PARTITION_FAILED -18
#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19
#endif
#define CL_INVALID_VALUE -30
#define CL_INVALID_DEVICE_TYPE -31
#define CL_INVALID_PLATFORM -32
#define CL_INVALID_DEVICE -33
#define CL_INVALID_CONTEXT -34
#define CL_INVALID_QUEUE_PROPERTIES -35
#define CL_INVALID_COMMAND_QUEUE -36
#define CL_INVALID_HOST_PTR -37
#define CL_INVALID_MEM_OBJECT -38
#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
#define CL_INVALID_IMAGE_SIZE -40
#define CL_INVALID_SAMPLER -41
#define CL_INVALID_BINARY -42
#define CL_INVALID_BUILD_OPTIONS -43
#define CL_INVALID_PROGRAM -44
#define CL_INVALID_PROGRAM_EXECUTABLE -45
#define CL_INVALID_KERNEL_NAME -46
#define CL_INVALID_KERNEL_DEFINITION -47
#define CL_INVALID_KERNEL -48
#define CL_INVALID_ARG_INDEX -49
#define CL_INVALID_ARG_VALUE -50
#define CL_INVALID_ARG_SIZE -51
#define CL_INVALID_KERNEL_ARGS -52
#define CL_INVALID_WORK_DIMENSION -53
#define CL_INVALID_WORK_GROUP_SIZE -54
#define CL_INVALID_WORK_ITEM_SIZE -55
#define CL_INVALID_GLOBAL_OFFSET -56
#define CL_INVALID_EVENT_WAIT_LIST -57
#define CL_INVALID_EVENT -58
#define CL_INVALID_OPERATION -59
#define CL_INVALID_GL_OBJECT -60
#define CL_INVALID_BUFFER_SIZE -61
#define CL_INVALID_MIP_LEVEL -62
#define CL_INVALID_GLOBAL_WORK_SIZE -63
#ifdef CL_VERSION_1_1
#define CL_INVALID_PROPERTY -64
#endif
#ifdef CL_VERSION_1_2
#define CL_INVALID_IMAGE_DESCRIPTOR -65
#define CL_INVALID_COMPILER_OPTIONS -66
#define CL_INVALID_LINKER_OPTIONS -67
#define CL_INVALID_DEVICE_PARTITION_COUNT -68
#endif
#ifdef CL_VERSION_2_0
#define CL_INVALID_PIPE_SIZE -69
#define CL_INVALID_DEVICE_QUEUE -70
#endif
#ifdef CL_VERSION_2_2
#define CL_INVALID_SPEC_ID -71
#define CL_MAX_SIZE_RESTRICTION_EXCEEDED -72
#endif
/* cl_bool */
#define CL_FALSE 0
#define CL_TRUE 1
#ifdef CL_VERSION_1_2
#define CL_BLOCKING CL_TRUE
#define CL_NON_BLOCKING CL_FALSE
#endif
/* cl_platform_info */
#define CL_PLATFORM_PROFILE 0x0900
#define CL_PLATFORM_VERSION 0x0901
#define CL_PLATFORM_NAME 0x0902
#define CL_PLATFORM_VENDOR 0x0903
#define CL_PLATFORM_EXTENSIONS 0x0904
#ifdef CL_VERSION_2_1
#define CL_PLATFORM_HOST_TIMER_RESOLUTION 0x0905
#endif
#ifdef CL_VERSION_3_0
#define CL_PLATFORM_NUMERIC_VERSION 0x0906
#define CL_PLATFORM_EXTENSIONS_WITH_VERSION 0x0907
#endif
/* cl_device_type - bitfield */
#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
#define CL_DEVICE_TYPE_CPU (1 << 1)
#define CL_DEVICE_TYPE_GPU (1 << 2)
#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
#ifdef CL_VERSION_1_2
#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
#endif
#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
/* cl_device_info */
#define CL_DEVICE_TYPE 0x1000
#define CL_DEVICE_VENDOR_ID 0x1001
#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
#define CL_DEVICE_ADDRESS_BITS 0x100D
#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
#define CL_DEVICE_IMAGE_SUPPORT 0x1016
#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
#define CL_DEVICE_MAX_SAMPLERS 0x1018
#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
#define CL_DEVICE_ENDIAN_LITTLE 0x1026
#define CL_DEVICE_AVAILABLE 0x1027
#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
#define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */
#ifdef CL_VERSION_2_0
#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A
#endif
#define CL_DEVICE_NAME 0x102B
#define CL_DEVICE_VENDOR 0x102C
#define CL_DRIVER_VERSION 0x102D
#define CL_DEVICE_PROFILE 0x102E
#define CL_DEVICE_VERSION 0x102F
#define CL_DEVICE_EXTENSIONS 0x1030
#define CL_DEVICE_PLATFORM 0x1031
#ifdef CL_VERSION_1_2
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
#endif
/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG which is already defined in "cl_ext.h" */
#ifdef CL_VERSION_1_1
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
#endif
#ifdef CL_VERSION_1_2
#define CL_DEVICE_LINKER_AVAILABLE 0x103E
#define CL_DEVICE_BUILT_IN_KERNELS 0x103F
#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040
#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041
#define CL_DEVICE_PARENT_DEVICE 0x1042
#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043
#define CL_DEVICE_PARTITION_PROPERTIES 0x1044
#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045
#define CL_DEVICE_PARTITION_TYPE 0x1046
#define CL_DEVICE_REFERENCE_COUNT 0x1047
#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048
#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049
#endif
#ifdef CL_VERSION_2_0
#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A
#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B
#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C
#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D
#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E
#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F
#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050
#define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051
#define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052
#define CL_DEVICE_SVM_CAPABILITIES 0x1053
#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054
#define CL_DEVICE_MAX_PIPE_ARGS 0x1055
#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056
#define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057
#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058
#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059
#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A
#endif
#ifdef CL_VERSION_2_1
#define CL_DEVICE_IL_VERSION 0x105B
#define CL_DEVICE_MAX_NUM_SUB_GROUPS 0x105C
#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D
#endif
#ifdef CL_VERSION_3_0
#define CL_DEVICE_NUMERIC_VERSION 0x105E
#define CL_DEVICE_EXTENSIONS_WITH_VERSION 0x1060
#define CL_DEVICE_ILS_WITH_VERSION 0x1061
#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION 0x1062
#define CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES 0x1063
#define CL_DEVICE_ATOMIC_FENCE_CAPABILITIES 0x1064
#define CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT 0x1065
#define CL_DEVICE_OPENCL_C_ALL_VERSIONS 0x1066
#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x1067
#define CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT 0x1068
#define CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT 0x1069
/* 0x106A to 0x106E - Reserved for upcoming KHR extension */
#define CL_DEVICE_OPENCL_C_FEATURES 0x106F
#define CL_DEVICE_DEVICE_ENQUEUE_SUPPORT 0x1070
#define CL_DEVICE_PIPE_SUPPORT 0x1071
#endif
/* cl_device_fp_config - bitfield */
#define CL_FP_DENORM (1 << 0)
#define CL_FP_INF_NAN (1 << 1)
#define CL_FP_ROUND_TO_NEAREST (1 << 2)
#define CL_FP_ROUND_TO_ZERO (1 << 3)
#define CL_FP_ROUND_TO_INF (1 << 4)
#define CL_FP_FMA (1 << 5)
#ifdef CL_VERSION_1_1
#define CL_FP_SOFT_FLOAT (1 << 6)
#endif
#ifdef CL_VERSION_1_2
#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7)
#endif
/* cl_device_mem_cache_type */
#define CL_NONE 0x0
#define CL_READ_ONLY_CACHE 0x1
#define CL_READ_WRITE_CACHE 0x2
/* cl_device_local_mem_type */
#define CL_LOCAL 0x1
#define CL_GLOBAL 0x2
/* cl_device_exec_capabilities - bitfield */
#define CL_EXEC_KERNEL (1 << 0)
#define CL_EXEC_NATIVE_KERNEL (1 << 1)
/* cl_command_queue_properties - bitfield */
#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
#ifdef CL_VERSION_2_0
#define CL_QUEUE_ON_DEVICE (1 << 2)
#define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3)
#endif
/* cl_context_info */
#define CL_CONTEXT_REFERENCE_COUNT 0x1080
#define CL_CONTEXT_DEVICES 0x1081
#define CL_CONTEXT_PROPERTIES 0x1082
#ifdef CL_VERSION_1_1
#define CL_CONTEXT_NUM_DEVICES 0x1083
#endif
/* cl_context_properties */
#define CL_CONTEXT_PLATFORM 0x1084
#ifdef CL_VERSION_1_2
#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085
#endif
#ifdef CL_VERSION_1_2
/* cl_device_partition_property */
#define CL_DEVICE_PARTITION_EQUALLY 0x1086
#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087
#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088
#endif
#ifdef CL_VERSION_1_2
/* cl_device_affinity_domain */
#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0)
#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1)
#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2)
#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3)
#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4)
#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
#endif
#ifdef CL_VERSION_2_0
/* cl_device_svm_capabilities */
#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0)
#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER (1 << 1)
#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (1 << 2)
#define CL_DEVICE_SVM_ATOMICS (1 << 3)
#endif
/* cl_command_queue_info */
#define CL_QUEUE_CONTEXT 0x1090
#define CL_QUEUE_DEVICE 0x1091
#define CL_QUEUE_REFERENCE_COUNT 0x1092
#define CL_QUEUE_PROPERTIES 0x1093
#ifdef CL_VERSION_2_0
#define CL_QUEUE_SIZE 0x1094
#endif
#ifdef CL_VERSION_2_1
#define CL_QUEUE_DEVICE_DEFAULT 0x1095
#endif
#ifdef CL_VERSION_3_0
#define CL_QUEUE_PROPERTIES_ARRAY 0x1098
#endif
/* cl_mem_flags and cl_svm_mem_flags - bitfield */
#define CL_MEM_READ_WRITE (1 << 0)
#define CL_MEM_WRITE_ONLY (1 << 1)
#define CL_MEM_READ_ONLY (1 << 2)
#define CL_MEM_USE_HOST_PTR (1 << 3)
#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
#define CL_MEM_COPY_HOST_PTR (1 << 5)
/* reserved (1 << 6) */
#ifdef CL_VERSION_1_2
#define CL_MEM_HOST_WRITE_ONLY (1 << 7)
#define CL_MEM_HOST_READ_ONLY (1 << 8)
#define CL_MEM_HOST_NO_ACCESS (1 << 9)
#endif
#ifdef CL_VERSION_2_0
#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */
#define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */
#define CL_MEM_KERNEL_READ_AND_WRITE (1 << 12)
#endif
#ifdef CL_VERSION_1_2
/* cl_mem_migration_flags - bitfield */
#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0)
#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1)
#endif
/* cl_channel_order */
#define CL_R 0x10B0
#define CL_A 0x10B1
#define CL_RG 0x10B2
#define CL_RA 0x10B3
#define CL_RGB 0x10B4
#define CL_RGBA 0x10B5
#define CL_BGRA 0x10B6
#define CL_ARGB 0x10B7
#define CL_INTENSITY 0x10B8
#define CL_LUMINANCE 0x10B9
#ifdef CL_VERSION_1_1
#define CL_Rx 0x10BA
#define CL_RGx 0x10BB
#define CL_RGBx 0x10BC
#endif
#ifdef CL_VERSION_1_2
#define CL_DEPTH 0x10BD
#define CL_DEPTH_STENCIL 0x10BE
#endif
#ifdef CL_VERSION_2_0
#define CL_sRGB 0x10BF
#define CL_sRGBx 0x10C0
#define CL_sRGBA 0x10C1
#define CL_sBGRA 0x10C2
#define CL_ABGR 0x10C3
#endif
/* cl_channel_type */
#define CL_SNORM_INT8 0x10D0
#define CL_SNORM_INT16 0x10D1
#define CL_UNORM_INT8 0x10D2
#define CL_UNORM_INT16 0x10D3
#define CL_UNORM_SHORT_565 0x10D4
#define CL_UNORM_SHORT_555 0x10D5
#define CL_UNORM_INT_101010 0x10D6
#define CL_SIGNED_INT8 0x10D7
#define CL_SIGNED_INT16 0x10D8
#define CL_SIGNED_INT32 0x10D9
#define CL_UNSIGNED_INT8 0x10DA
#define CL_UNSIGNED_INT16 0x10DB
#define CL_UNSIGNED_INT32 0x10DC
#define CL_HALF_FLOAT 0x10DD
#define CL_FLOAT 0x10DE
#ifdef CL_VERSION_1_2
#define CL_UNORM_INT24 0x10DF
#endif
#ifdef CL_VERSION_2_1
#define CL_UNORM_INT_101010_2 0x10E0
#endif
/* cl_mem_object_type */
#define CL_MEM_OBJECT_BUFFER 0x10F0
#define CL_MEM_OBJECT_IMAGE2D 0x10F1
#define CL_MEM_OBJECT_IMAGE3D 0x10F2
#ifdef CL_VERSION_1_2
#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3
#define CL_MEM_OBJECT_IMAGE1D 0x10F4
#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5
#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6
#endif
#ifdef CL_VERSION_2_0
#define CL_MEM_OBJECT_PIPE 0x10F7
#endif
/* cl_mem_info */
#define CL_MEM_TYPE 0x1100
#define CL_MEM_FLAGS 0x1101
#define CL_MEM_SIZE 0x1102
#define CL_MEM_HOST_PTR 0x1103
#define CL_MEM_MAP_COUNT 0x1104
#define CL_MEM_REFERENCE_COUNT 0x1105
#define CL_MEM_CONTEXT 0x1106
#ifdef CL_VERSION_1_1
#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
#define CL_MEM_OFFSET 0x1108
#endif
#ifdef CL_VERSION_2_0
#define CL_MEM_USES_SVM_POINTER 0x1109
#endif
#ifdef CL_VERSION_3_0
#define CL_MEM_PROPERTIES 0x110A
#endif
/* cl_image_info */
#define CL_IMAGE_FORMAT 0x1110
#define CL_IMAGE_ELEMENT_SIZE 0x1111
#define CL_IMAGE_ROW_PITCH 0x1112
#define CL_IMAGE_SLICE_PITCH 0x1113
#define CL_IMAGE_WIDTH 0x1114
#define CL_IMAGE_HEIGHT 0x1115
#define CL_IMAGE_DEPTH 0x1116
#ifdef CL_VERSION_1_2
#define CL_IMAGE_ARRAY_SIZE 0x1117
#define CL_IMAGE_BUFFER 0x1118
#define CL_IMAGE_NUM_MIP_LEVELS 0x1119
#define CL_IMAGE_NUM_SAMPLES 0x111A
#endif
/* cl_pipe_info */
#ifdef CL_VERSION_2_0
#define CL_PIPE_PACKET_SIZE 0x1120
#define CL_PIPE_MAX_PACKETS 0x1121
#endif
#ifdef CL_VERSION_3_0
#define CL_PIPE_PROPERTIES 0x1122
#endif
/* cl_addressing_mode */
#define CL_ADDRESS_NONE 0x1130
#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
#define CL_ADDRESS_CLAMP 0x1132
#define CL_ADDRESS_REPEAT 0x1133
#ifdef CL_VERSION_1_1
#define CL_ADDRESS_MIRRORED_REPEAT 0x1134
#endif
/* cl_filter_mode */
#define CL_FILTER_NEAREST 0x1140
#define CL_FILTER_LINEAR 0x1141
/* cl_sampler_info */
#define CL_SAMPLER_REFERENCE_COUNT 0x1150
#define CL_SAMPLER_CONTEXT 0x1151
#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
#define CL_SAMPLER_ADDRESSING_MODE 0x1153
#define CL_SAMPLER_FILTER_MODE 0x1154
#ifdef CL_VERSION_2_0
/* These enumerants are for the cl_khr_mipmap_image extension.
They have since been added to cl_ext.h with an appropriate
KHR suffix, but are left here for backwards compatibility. */
#define CL_SAMPLER_MIP_FILTER_MODE 0x1155
#define CL_SAMPLER_LOD_MIN 0x1156
#define CL_SAMPLER_LOD_MAX 0x1157
#endif
#ifdef CL_VERSION_3_0
#define CL_SAMPLER_PROPERTIES 0x1158
#endif
/* cl_map_flags - bitfield */
#define CL_MAP_READ (1 << 0)
#define CL_MAP_WRITE (1 << 1)
#ifdef CL_VERSION_1_2
#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
#endif
/* cl_program_info */
#define CL_PROGRAM_REFERENCE_COUNT 0x1160
#define CL_PROGRAM_CONTEXT 0x1161
#define CL_PROGRAM_NUM_DEVICES 0x1162
#define CL_PROGRAM_DEVICES 0x1163
#define CL_PROGRAM_SOURCE 0x1164
#define CL_PROGRAM_BINARY_SIZES 0x1165
#define CL_PROGRAM_BINARIES 0x1166
#ifdef CL_VERSION_1_2
#define CL_PROGRAM_NUM_KERNELS 0x1167
#define CL_PROGRAM_KERNEL_NAMES 0x1168
#endif
#ifdef CL_VERSION_2_1
#define CL_PROGRAM_IL 0x1169
#endif
#ifdef CL_VERSION_2_2
#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT 0x116A
#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT 0x116B
#endif
/* cl_program_build_info */
#define CL_PROGRAM_BUILD_STATUS 0x1181
#define CL_PROGRAM_BUILD_OPTIONS 0x1182
#define CL_PROGRAM_BUILD_LOG 0x1183
#ifdef CL_VERSION_1_2
#define CL_PROGRAM_BINARY_TYPE 0x1184
#endif
#ifdef CL_VERSION_2_0
#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185
#endif
#ifdef CL_VERSION_1_2
/* cl_program_binary_type */
#define CL_PROGRAM_BINARY_TYPE_NONE 0x0
#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1
#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2
#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4
#endif
/* cl_build_status */
#define CL_BUILD_SUCCESS 0
#define CL_BUILD_NONE -1
#define CL_BUILD_ERROR -2
#define CL_BUILD_IN_PROGRESS -3
/* cl_kernel_info */
#define CL_KERNEL_FUNCTION_NAME 0x1190
#define CL_KERNEL_NUM_ARGS 0x1191
#define CL_KERNEL_REFERENCE_COUNT 0x1192
#define CL_KERNEL_CONTEXT 0x1193
#define CL_KERNEL_PROGRAM 0x1194
#ifdef CL_VERSION_1_2
#define CL_KERNEL_ATTRIBUTES 0x1195
#endif
#ifdef CL_VERSION_1_2
/* cl_kernel_arg_info */
#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196
#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197
#define CL_KERNEL_ARG_TYPE_NAME 0x1198
#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199
#define CL_KERNEL_ARG_NAME 0x119A
#endif
#ifdef CL_VERSION_1_2
/* cl_kernel_arg_address_qualifier */
#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B
#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C
#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D
#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E
#endif
#ifdef CL_VERSION_1_2
/* cl_kernel_arg_access_qualifier */
#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0
#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1
#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2
#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3
#endif
#ifdef CL_VERSION_1_2
/* cl_kernel_arg_type_qualifier */
#define CL_KERNEL_ARG_TYPE_NONE 0
#define CL_KERNEL_ARG_TYPE_CONST (1 << 0)
#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1)
#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2)
#ifdef CL_VERSION_2_0
#define CL_KERNEL_ARG_TYPE_PIPE (1 << 3)
#endif
#endif
/* cl_kernel_work_group_info */
#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
#ifdef CL_VERSION_1_2
#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5
#endif
#ifdef CL_VERSION_2_1
/* cl_kernel_sub_group_info */
/* NOTE(review): the first two values intentionally fall outside the 0x11Bx
   range used by the rest of this group — they were inherited unchanged from
   the cl_khr_subgroups extension when sub-groups were promoted to core in
   OpenCL 2.1. Do not "fix" them to be sequential. */
#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE 0x2033
#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE 0x2034
#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT 0x11B8
#define CL_KERNEL_MAX_NUM_SUB_GROUPS 0x11B9
#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS 0x11BA
#endif
#ifdef CL_VERSION_2_0
/* cl_kernel_exec_info */
#define CL_KERNEL_EXEC_INFO_SVM_PTRS 0x11B6
#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM 0x11B7
#endif
/* cl_event_info */
#define CL_EVENT_COMMAND_QUEUE 0x11D0
#define CL_EVENT_COMMAND_TYPE 0x11D1
#define CL_EVENT_REFERENCE_COUNT 0x11D2
#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
#ifdef CL_VERSION_1_1
#define CL_EVENT_CONTEXT 0x11D4
#endif
/* cl_command_type */
#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
#define CL_COMMAND_TASK 0x11F1
#define CL_COMMAND_NATIVE_KERNEL 0x11F2
#define CL_COMMAND_READ_BUFFER 0x11F3
#define CL_COMMAND_WRITE_BUFFER 0x11F4
#define CL_COMMAND_COPY_BUFFER 0x11F5
#define CL_COMMAND_READ_IMAGE 0x11F6
#define CL_COMMAND_WRITE_IMAGE 0x11F7
#define CL_COMMAND_COPY_IMAGE 0x11F8
#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
#define CL_COMMAND_MAP_BUFFER 0x11FB
#define CL_COMMAND_MAP_IMAGE 0x11FC
#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
#define CL_COMMAND_MARKER 0x11FE
#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
#ifdef CL_VERSION_1_1
#define CL_COMMAND_READ_BUFFER_RECT 0x1201
#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202
#define CL_COMMAND_COPY_BUFFER_RECT 0x1203
#define CL_COMMAND_USER 0x1204
#endif
#ifdef CL_VERSION_1_2
#define CL_COMMAND_BARRIER 0x1205
#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206
#define CL_COMMAND_FILL_BUFFER 0x1207
#define CL_COMMAND_FILL_IMAGE 0x1208
#endif
#ifdef CL_VERSION_2_0
#define CL_COMMAND_SVM_FREE 0x1209
#define CL_COMMAND_SVM_MEMCPY 0x120A
#define CL_COMMAND_SVM_MEMFILL 0x120B
#define CL_COMMAND_SVM_MAP 0x120C
#define CL_COMMAND_SVM_UNMAP 0x120D
#endif
#ifdef CL_VERSION_3_0
#define CL_COMMAND_SVM_MIGRATE_MEM 0x120E
#endif
/* command execution status */
#define CL_COMPLETE 0x0
#define CL_RUNNING 0x1
#define CL_SUBMITTED 0x2
#define CL_QUEUED 0x3
/* cl_buffer_create_type */
#ifdef CL_VERSION_1_1
#define CL_BUFFER_CREATE_TYPE_REGION 0x1220
#endif
/* cl_profiling_info */
#define CL_PROFILING_COMMAND_QUEUED 0x1280
#define CL_PROFILING_COMMAND_SUBMIT 0x1281
#define CL_PROFILING_COMMAND_START 0x1282
#define CL_PROFILING_COMMAND_END 0x1283
#ifdef CL_VERSION_2_0
#define CL_PROFILING_COMMAND_COMPLETE 0x1284
#endif
/* cl_device_atomic_capabilities - bitfield */
#ifdef CL_VERSION_3_0
#define CL_DEVICE_ATOMIC_ORDER_RELAXED (1 << 0)
#define CL_DEVICE_ATOMIC_ORDER_ACQ_REL (1 << 1)
#define CL_DEVICE_ATOMIC_ORDER_SEQ_CST (1 << 2)
#define CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM (1 << 3)
#define CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP (1 << 4)
#define CL_DEVICE_ATOMIC_SCOPE_DEVICE (1 << 5)
#define CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES (1 << 6)
#endif
/* cl_khronos_vendor_id */
#define CL_KHRONOS_VENDOR_ID_CODEPLAY 0x10004
#ifdef CL_VERSION_3_0
/* cl_version */
/* A cl_version packs major.minor.patch into one 32-bit value with a
   10/10/12 bit split: bits 31..22 = major, 21..12 = minor, 11..0 = patch. */
#define CL_VERSION_MAJOR_BITS (10)
#define CL_VERSION_MINOR_BITS (10)
#define CL_VERSION_PATCH_BITS (12)
/* Low-order masks for each field: ((1 << bits) - 1). */
#define CL_VERSION_MAJOR_MASK ((1 << CL_VERSION_MAJOR_BITS) - 1)
#define CL_VERSION_MINOR_MASK ((1 << CL_VERSION_MINOR_BITS) - 1)
#define CL_VERSION_PATCH_MASK ((1 << CL_VERSION_PATCH_BITS) - 1)
/* Extract major: shift past the minor and patch bits (no mask needed,
   assuming the value was packed with CL_MAKE_VERSION so the top field is
   already only 10 bits wide). */
#define CL_VERSION_MAJOR(version) \
((version) >> (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS))
/* Extract minor: shift past the patch bits, then mask to 10 bits. */
#define CL_VERSION_MINOR(version) \
(((version) >> CL_VERSION_PATCH_BITS) & CL_VERSION_MINOR_MASK)
/* Extract patch: mask the low 12 bits. */
#define CL_VERSION_PATCH(version) ((version) & CL_VERSION_PATCH_MASK)
/* Compose a packed cl_version. Each field is masked before shifting, so
   out-of-range inputs are silently truncated rather than corrupting the
   neighboring fields. Arguments are expanded more than once — avoid
   side-effecting expressions. */
#define CL_MAKE_VERSION(major, minor, patch) \
((((major) & CL_VERSION_MAJOR_MASK) \
<< (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS)) | \
(((minor) & CL_VERSION_MINOR_MASK) << CL_VERSION_PATCH_BITS) | \
((patch) & CL_VERSION_PATCH_MASK))
#endif
/********************************************************************************************************/
/* Platform API */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformIDs(cl_uint num_entries,
cl_platform_id * platforms,
cl_uint * num_platforms) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformInfo(cl_platform_id platform,
cl_platform_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Device APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDs(cl_platform_id platform,
cl_device_type device_type,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceInfo(cl_device_id device,
cl_device_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateSubDevices(cl_device_id in_device,
const cl_device_partition_property * properties,
cl_uint num_devices,
cl_device_id * out_devices,
cl_uint * num_devices_ret) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2;
#endif
#ifdef CL_VERSION_2_1
extern CL_API_ENTRY cl_int CL_API_CALL
clSetDefaultDeviceCommandQueue(cl_context context,
cl_device_id device,
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceAndHostTimer(cl_device_id device,
cl_ulong* device_timestamp,
cl_ulong* host_timestamp) CL_API_SUFFIX__VERSION_2_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetHostTimer(cl_device_id device,
cl_ulong * host_timestamp) CL_API_SUFFIX__VERSION_2_1;
#endif
/* Context APIs */
extern CL_API_ENTRY cl_context CL_API_CALL
clCreateContext(const cl_context_properties * properties,
cl_uint num_devices,
const cl_device_id * devices,
void (CL_CALLBACK * pfn_notify)(const char * errinfo,
const void * private_info,
size_t cb,
void * user_data),
void * user_data,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_context CL_API_CALL
clCreateContextFromType(const cl_context_properties * properties,
cl_device_type device_type,
void (CL_CALLBACK * pfn_notify)(const char * errinfo,
const void * private_info,
size_t cb,
void * user_data),
void * user_data,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetContextInfo(cl_context context,
cl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Command Queue APIs */
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY cl_command_queue CL_API_CALL
clCreateCommandQueueWithProperties(cl_context context,
cl_device_id device,
const cl_queue_properties * properties,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetCommandQueueInfo(cl_command_queue command_queue,
cl_command_queue_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Memory Object APIs */
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateBuffer(cl_context context,
cl_mem_flags flags,
size_t size,
void * host_ptr,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateSubBuffer(cl_mem buffer,
cl_mem_flags flags,
cl_buffer_create_type buffer_create_type,
const void * buffer_create_info,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1;
#endif
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateImage(cl_context context,
cl_mem_flags flags,
const cl_image_format * image_format,
const cl_image_desc * image_desc,
void * host_ptr,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#endif
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreatePipe(cl_context context,
cl_mem_flags flags,
cl_uint pipe_packet_size,
cl_uint pipe_max_packets,
const cl_pipe_properties * properties,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0;
#endif
#ifdef CL_VERSION_3_0
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateBufferWithProperties(cl_context context,
const cl_mem_properties * properties,
cl_mem_flags flags,
size_t size,
void * host_ptr,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_3_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateImageWithProperties(cl_context context,
const cl_mem_properties * properties,
cl_mem_flags flags,
const cl_image_format * image_format,
const cl_image_desc * image_desc,
void * host_ptr,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_3_0;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSupportedImageFormats(cl_context context,
cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint num_entries,
cl_image_format * image_formats,
cl_uint * num_image_formats) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetMemObjectInfo(cl_mem memobj,
cl_mem_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetImageInfo(cl_mem image,
cl_image_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPipeInfo(cl_mem pipe,
cl_pipe_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_2_0;
#endif
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_int CL_API_CALL
clSetMemObjectDestructorCallback(cl_mem memobj,
void (CL_CALLBACK * pfn_notify)(cl_mem memobj,
void * user_data),
void * user_data) CL_API_SUFFIX__VERSION_1_1;
#endif
/* SVM Allocation APIs */
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY void * CL_API_CALL
clSVMAlloc(cl_context context,
cl_svm_mem_flags flags,
size_t size,
cl_uint alignment) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY void CL_API_CALL
clSVMFree(cl_context context,
void * svm_pointer) CL_API_SUFFIX__VERSION_2_0;
#endif
/* Sampler APIs */
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY cl_sampler CL_API_CALL
clCreateSamplerWithProperties(cl_context context,
const cl_sampler_properties * sampler_properties,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSamplerInfo(cl_sampler sampler,
cl_sampler_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Program Object APIs */
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithSource(cl_context context,
cl_uint count,
const char ** strings,
const size_t * lengths,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithBinary(cl_context context,
cl_uint num_devices,
const cl_device_id * device_list,
const size_t * lengths,
const unsigned char ** binaries,
cl_int * binary_status,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithBuiltInKernels(cl_context context,
cl_uint num_devices,
const cl_device_id * device_list,
const char * kernel_names,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#endif
#ifdef CL_VERSION_2_1
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithIL(cl_context context,
const void* il,
size_t length,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clBuildProgram(cl_program program,
cl_uint num_devices,
const cl_device_id * device_list,
const char * options,
void (CL_CALLBACK * pfn_notify)(cl_program program,
void * user_data),
void * user_data) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clCompileProgram(cl_program program,
cl_uint num_devices,
const cl_device_id * device_list,
const char * options,
cl_uint num_input_headers,
const cl_program * input_headers,
const char ** header_include_names,
void (CL_CALLBACK * pfn_notify)(cl_program program,
void * user_data),
void * user_data) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_program CL_API_CALL
clLinkProgram(cl_context context,
cl_uint num_devices,
const cl_device_id * device_list,
const char * options,
cl_uint num_input_programs,
const cl_program * input_programs,
void (CL_CALLBACK * pfn_notify)(cl_program program,
void * user_data),
void * user_data,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#endif
#ifdef CL_VERSION_2_2
extern CL_API_ENTRY cl_int CL_API_CALL
clSetProgramReleaseCallback(cl_program program,
void (CL_CALLBACK * pfn_notify)(cl_program program,
void * user_data),
void * user_data) CL_API_SUFFIX__VERSION_2_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetProgramSpecializationConstant(cl_program program,
cl_uint spec_id,
size_t spec_size,
const void* spec_value) CL_API_SUFFIX__VERSION_2_2;
#endif
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clUnloadPlatformCompiler(cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clGetProgramInfo(cl_program program,
cl_program_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetProgramBuildInfo(cl_program program,
cl_device_id device,
cl_program_build_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Kernel Object APIs */
extern CL_API_ENTRY cl_kernel CL_API_CALL
clCreateKernel(cl_program program,
const char * kernel_name,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateKernelsInProgram(cl_program program,
cl_uint num_kernels,
cl_kernel * kernels,
cl_uint * num_kernels_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_1
extern CL_API_ENTRY cl_kernel CL_API_CALL
clCloneKernel(cl_kernel source_kernel,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArg(cl_kernel kernel,
cl_uint arg_index,
size_t arg_size,
const void * arg_value) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArgSVMPointer(cl_kernel kernel,
cl_uint arg_index,
const void * arg_value) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelExecInfo(cl_kernel kernel,
cl_kernel_exec_info param_name,
size_t param_value_size,
const void * param_value) CL_API_SUFFIX__VERSION_2_0;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelInfo(cl_kernel kernel,
cl_kernel_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelArgInfo(cl_kernel kernel,
cl_uint arg_indx,
cl_kernel_arg_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelWorkGroupInfo(cl_kernel kernel,
cl_device_id device,
cl_kernel_work_group_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_1
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelSubGroupInfo(cl_kernel kernel,
cl_device_id device,
cl_kernel_sub_group_info param_name,
size_t input_value_size,
const void* input_value,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_2_1;
#endif
/* Event Object APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clWaitForEvents(cl_uint num_events,
const cl_event * event_list) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventInfo(cl_event event,
cl_event_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateUserEvent(cl_context context,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_int CL_API_CALL
clSetUserEventStatus(cl_event event,
cl_int execution_status) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetEventCallback(cl_event event,
cl_int command_exec_callback_type,
void (CL_CALLBACK * pfn_notify)(cl_event event,
cl_int event_command_status,
void * user_data),
void * user_data) CL_API_SUFFIX__VERSION_1_1;
#endif
/* Profiling APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventProfilingInfo(cl_event event,
cl_profiling_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Flush and Finish APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
/* Enqueued Commands APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadBuffer(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_read,
size_t offset,
size_t size,
void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadBufferRect(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_read,
const size_t * buffer_offset,
const size_t * host_offset,
const size_t * region,
size_t buffer_row_pitch,
size_t buffer_slice_pitch,
size_t host_row_pitch,
size_t host_slice_pitch,
void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_1;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteBuffer(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_write,
size_t offset,
size_t size,
const void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteBufferRect(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_write,
const size_t * buffer_offset,
const size_t * host_offset,
const size_t * region,
size_t buffer_row_pitch,
size_t buffer_slice_pitch,
size_t host_row_pitch,
size_t host_slice_pitch,
const void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_1;
#endif
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueFillBuffer(cl_command_queue command_queue,
cl_mem buffer,
const void * pattern,
size_t pattern_size,
size_t offset,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBuffer(cl_command_queue command_queue,
cl_mem src_buffer,
cl_mem dst_buffer,
size_t src_offset,
size_t dst_offset,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBufferRect(cl_command_queue command_queue,
cl_mem src_buffer,
cl_mem dst_buffer,
const size_t * src_origin,
const size_t * dst_origin,
const size_t * region,
size_t src_row_pitch,
size_t src_slice_pitch,
size_t dst_row_pitch,
size_t dst_slice_pitch,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_1;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadImage(cl_command_queue command_queue,
cl_mem image,
cl_bool blocking_read,
const size_t * origin,
const size_t * region,
size_t row_pitch,
size_t slice_pitch,
void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteImage(cl_command_queue command_queue,
cl_mem image,
cl_bool blocking_write,
const size_t * origin,
const size_t * region,
size_t input_row_pitch,
size_t input_slice_pitch,
const void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueFillImage(cl_command_queue command_queue,
cl_mem image,
const void * fill_color,
const size_t * origin,
const size_t * region,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyImage(cl_command_queue command_queue,
cl_mem src_image,
cl_mem dst_image,
const size_t * src_origin,
const size_t * dst_origin,
const size_t * region,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
cl_mem src_image,
cl_mem dst_buffer,
const size_t * src_origin,
const size_t * region,
size_t dst_offset,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBufferToImage(cl_command_queue command_queue,
cl_mem src_buffer,
cl_mem dst_image,
size_t src_offset,
const size_t * dst_origin,
const size_t * region,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY void * CL_API_CALL
clEnqueueMapBuffer(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_map,
cl_map_flags map_flags,
size_t offset,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY void * CL_API_CALL
clEnqueueMapImage(cl_command_queue command_queue,
cl_mem image,
cl_bool blocking_map,
cl_map_flags map_flags,
const size_t * origin,
const size_t * region,
size_t * image_row_pitch,
size_t * image_slice_pitch,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueUnmapMemObject(cl_command_queue command_queue,
cl_mem memobj,
void * mapped_ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMigrateMemObjects(cl_command_queue command_queue,
cl_uint num_mem_objects,
const cl_mem * mem_objects,
cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueNDRangeKernel(cl_command_queue command_queue,
cl_kernel kernel,
cl_uint work_dim,
const size_t * global_work_offset,
const size_t * global_work_size,
const size_t * local_work_size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueNativeKernel(cl_command_queue command_queue,
void (CL_CALLBACK * user_func)(void *),
void * args,
size_t cb_args,
cl_uint num_mem_objects,
const cl_mem * mem_list,
const void ** args_mem_loc,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueBarrierWithWaitList(cl_command_queue command_queue,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#endif
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMFree(cl_command_queue command_queue,
cl_uint num_svm_pointers,
void * svm_pointers[],
void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue,
cl_uint num_svm_pointers,
void * svm_pointers[],
void * user_data),
void * user_data,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemcpy(cl_command_queue command_queue,
cl_bool blocking_copy,
void * dst_ptr,
const void * src_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemFill(cl_command_queue command_queue,
void * svm_ptr,
const void * pattern,
size_t pattern_size,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMap(cl_command_queue command_queue,
cl_bool blocking_map,
cl_map_flags flags,
void * svm_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMUnmap(cl_command_queue command_queue,
void * svm_ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_2_0;
#endif
#ifdef CL_VERSION_2_1
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMigrateMem(cl_command_queue command_queue,
cl_uint num_svm_pointers,
const void ** svm_pointers,
const size_t * sizes,
cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_2_1;
#endif
#ifdef CL_VERSION_1_2
/* Extension function access
*
* Returns the extension function address for the given function name,
* or NULL if a valid function can not be found. The client must
* check to make sure the address is not NULL, before using or
* calling the returned function address.
*/
extern CL_API_ENTRY void * CL_API_CALL
clGetExtensionFunctionAddressForPlatform(cl_platform_id platform,
const char * func_name) CL_API_SUFFIX__VERSION_1_2;
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
/*
* WARNING:
* This API introduces mutable state into the OpenCL implementation. It has been REMOVED
* to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the
* OpenCL 1.1 conformance test, and consequently may not work or may not work dependably.
* It is likely to be non-performant. Use of this API is not advised. Use at your own risk.
*
* Software developers previously relying on this API are instructed to set the command queue
* properties when creating the queue, instead.
*/
extern CL_API_ENTRY cl_int CL_API_CALL
clSetCommandQueueProperty(cl_command_queue command_queue,
cl_command_queue_properties properties,
cl_bool enable,
cl_command_queue_properties * old_properties) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */
/* Deprecated OpenCL 1.1 APIs */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateImage2D(cl_context context,
cl_mem_flags flags,
const cl_image_format * image_format,
size_t image_width,
size_t image_height,
size_t image_row_pitch,
void * host_ptr,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateImage3D(cl_context context,
cl_mem_flags flags,
const cl_image_format * image_format,
size_t image_width,
size_t image_height,
size_t image_depth,
size_t image_row_pitch,
size_t image_slice_pitch,
void * host_ptr,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clEnqueueMarker(cl_command_queue command_queue,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clEnqueueWaitForEvents(cl_command_queue command_queue,
cl_uint num_events,
const cl_event * event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clEnqueueBarrier(cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
clGetExtensionFunctionAddress(const char * func_name) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
/* Deprecated OpenCL 2.0 APIs */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL
clCreateCommandQueue(cl_context context,
cl_device_id device,
cl_command_queue_properties properties,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL
clCreateSampler(cl_context context,
cl_bool normalized_coords,
cl_addressing_mode addressing_mode,
cl_filter_mode filter_mode,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL
clEnqueueTask(cl_command_queue command_queue,
cl_kernel kernel,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_d3d10.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_D3D10_H
#define __OPENCL_CL_D3D10_H
#include <d3d10.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d10_sharing */
#define cl_khr_d3d10_sharing 1
typedef cl_uint cl_d3d10_device_source_khr;
typedef cl_uint cl_d3d10_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D10_DEVICE_KHR -1002
#define CL_INVALID_D3D10_RESOURCE_KHR -1003
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
/* cl_d3d10_device_source_nv */
#define CL_D3D10_DEVICE_KHR 0x4010
#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
/* cl_d3d10_device_set_nv */
#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
/* cl_context_info */
#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
/* cl_mem_info */
#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
/* cl_image_info */
#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
cl_platform_id platform,
cl_d3d10_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D10_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_d3d11.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_D3D11_H
#define __OPENCL_CL_D3D11_H
#include <d3d11.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d11_sharing */
#define cl_khr_d3d11_sharing 1
typedef cl_uint cl_d3d11_device_source_khr;
typedef cl_uint cl_d3d11_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D11_DEVICE_KHR -1006
#define CL_INVALID_D3D11_RESOURCE_KHR -1007
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009
/* cl_d3d11_device_source */
#define CL_D3D11_DEVICE_KHR 0x4019
#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A
/* cl_d3d11_device_set */
#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B
#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C
/* cl_context_info */
#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D
#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
/* cl_mem_info */
#define CL_MEM_D3D11_RESOURCE_KHR 0x401E
/* cl_image_info */
#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
cl_platform_id platform,
cl_d3d11_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D11_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_dx9_media_sharing.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
#define __OPENCL_CL_DX9_MEDIA_SHARING_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************/
/* cl_khr_dx9_media_sharing */
#define cl_khr_dx9_media_sharing 1
typedef cl_uint cl_dx9_media_adapter_type_khr;
typedef cl_uint cl_dx9_media_adapter_set_khr;
#if defined(_WIN32)
#include <d3d9.h>
/* Describes a Direct3D 9 surface being shared with OpenCL under the
 * cl_khr_dx9_media_sharing extension. Presumably this is the structure
 * passed through the void* surface_info argument of
 * clCreateFromDX9MediaSurfaceKHR_fn below — confirm against the
 * cl_khr_dx9_media_sharing extension specification. Only defined on
 * Windows (_WIN32), since it references D3D9/HANDLE types. */
typedef struct _cl_dx9_surface_info_khr
{
IDirect3DSurface9 *resource; /* the D3D9 surface to wrap as a cl_mem */
HANDLE shared_handle; /* shared handle of the surface; NOTE(review): may be NULL for non-shared surfaces — verify against spec */
} cl_dx9_surface_info_khr;
#endif
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010
#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011
#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012
#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013
/* cl_media_adapter_type_khr */
#define CL_ADAPTER_D3D9_KHR 0x2020
#define CL_ADAPTER_D3D9EX_KHR 0x2021
#define CL_ADAPTER_DXVA_KHR 0x2022
/* cl_media_adapter_set_khr */
#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023
#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024
/* cl_context_info */
#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025
#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026
#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027
/* cl_mem_info */
#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028
#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029
/* cl_image_info */
#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B
#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
cl_platform_id platform,
cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr * media_adapter_type,
void * media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
cl_context context,
cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type,
void * surface_info,
cl_uint plane,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_dx9_media_sharing_intel.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2019 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_dx9_media_sharing_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#include <d3d9.h>
#include <dxvahd.h>
#include <wtypes.h>
#include <d3d9types.h>
#ifdef __cplusplus
extern "C" {
#endif
/***************************************
* cl_intel_dx9_media_sharing extension *
****************************************/
#define cl_intel_dx9_media_sharing 1
typedef cl_uint cl_dx9_device_source_intel;
typedef cl_uint cl_dx9_device_set_intel;
/* error codes */
#define CL_INVALID_DX9_DEVICE_INTEL -1010
#define CL_INVALID_DX9_RESOURCE_INTEL -1011
#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012
#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013
/* cl_dx9_device_source_intel */
#define CL_D3D9_DEVICE_INTEL 0x4022
#define CL_D3D9EX_DEVICE_INTEL 0x4070
#define CL_DXVA_DEVICE_INTEL 0x4071
/* cl_dx9_device_set_intel */
#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024
#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025
/* cl_context_info */
#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026
#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072
#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073
/* cl_mem_info */
#define CL_MEM_DX9_RESOURCE_INTEL 0x4027
#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074
/* cl_image_info */
#define CL_IMAGE_DX9_PLANE_INTEL 0x4075
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A
#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B
/******************************************************************************/
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromDX9INTEL(
cl_platform_id platform,
cl_dx9_device_source_intel dx9_device_source,
void* dx9_object,
cl_dx9_device_set_intel dx9_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
cl_platform_id platform,
cl_dx9_device_source_intel dx9_device_source,
void* dx9_object,
cl_dx9_device_set_intel dx9_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromDX9MediaSurfaceINTEL(
cl_context context,
cl_mem_flags flags,
IDirect3DSurface9* resource,
HANDLE sharedHandle,
UINT plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DSurface9* resource,
HANDLE sharedHandle,
UINT plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireDX9ObjectsINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseDX9ObjectsINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_egl.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_EGL_H
#define __OPENCL_CL_EGL_H
#include <CL/cl.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
/* Error type for clCreateFromEGLImageKHR */
#define CL_INVALID_EGL_OBJECT_KHR -1093
#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
/* CLeglImageKHR is an opaque handle to an EGLImage */
typedef void* CLeglImageKHR;
/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
typedef void* CLeglDisplayKHR;
/* CLeglSyncKHR is an opaque handle to an EGLSync object */
typedef void* CLeglSyncKHR;
/* properties passed to clCreateFromEGLImageKHR */
typedef intptr_t cl_egl_image_properties_khr;
#define cl_khr_egl_image 1
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromEGLImageKHR(cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
#define cl_khr_egl_event 1
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromEGLSyncKHR(cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int * errcode_ret);
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_EGL_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_ext.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/* cl_ext.h contains OpenCL extensions which don't have external */
/* (OpenGL, D3D) dependencies. */
#ifndef __CL_EXT_H
#define __CL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl.h>
/* cl_khr_fp64 extension - no extension #define since it has no functions */
/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */
#if CL_TARGET_OPENCL_VERSION <= 110
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
#endif
/* cl_khr_fp16 extension - no extension #define since it has no functions */
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
/* Memory object destruction
*
* Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
*
* Registers a user callback function that will be called when the memory object is deleted and its resources
* freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
* stack associated with memobj. The registered user callback functions are called in the reverse order in
* which they were registered. The user callback functions are called and then the memory object is deleted
* and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
* notified when the memory referenced by host_ptr, specified when the memory object is created and used as
* the storage bits for the memory object, can be reused or freed.
*
* The application may not call CL api's with the cl_mem object passed to the pfn_notify.
*
* Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*/
#define cl_APPLE_SetMemObjectDestructor 1
cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem memobj,
void (* pfn_notify)(cl_mem memobj, void * user_data),
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/* Context Logging Functions
*
* The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
* Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*
* clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger
*/
#define cl_APPLE_ContextLoggingFunctions 1
extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * errstr,
const void * private_info,
size_t cb,
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * errstr,
const void * private_info,
size_t cb,
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * errstr,
const void * private_info,
size_t cb,
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/************************
* cl_khr_icd extension *
************************/
#define cl_khr_icd 1
/* cl_platform_info */
#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
/* Additional Error Codes */
#define CL_PLATFORM_NOT_FOUND_KHR -1001
extern CL_API_ENTRY cl_int CL_API_CALL
clIcdGetPlatformIDsKHR(cl_uint num_entries,
cl_platform_id * platforms,
cl_uint * num_platforms);
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint num_entries,
cl_platform_id * platforms,
cl_uint * num_platforms);
/*******************************
* cl_khr_il_program extension *
*******************************/
#define cl_khr_il_program 1
/* New property to clGetDeviceInfo for retrieving supported intermediate
* languages
*/
#define CL_DEVICE_IL_VERSION_KHR 0x105B
/* New property to clGetProgramInfo for retrieving the IL of a
 * program
 */
#define CL_PROGRAM_IL_KHR 0x1169
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithILKHR(cl_context context,
const void * il,
size_t length,
cl_int * errcode_ret);
typedef CL_API_ENTRY cl_program
(CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context context,
const void * il,
size_t length,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
/* Extension: cl_khr_image2d_from_buffer
*
* This extension allows a 2D image to be created from a cl_mem buffer without
* a copy. The type associated with a 2D image created from a buffer in an
* OpenCL program is image2d_t. Both the sampler and sampler-less read_image
* built-in functions are supported for 2D images and 2D images created from
* a buffer. Similarly, the write_image built-ins are also supported for 2D
* images created from a buffer.
*
* When the 2D image from buffer is created, the client must specify the
* width, height, image format (i.e. channel order and channel data type)
* and optionally the row pitch.
*
* The pitch specified must be a multiple of
* CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels.
* The base address of the buffer must be aligned to
* CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels.
*/
#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR 0x104A
#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR 0x104B
/**************************************
* cl_khr_initialize_memory extension *
**************************************/
#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030
/**************************************
* cl_khr_terminate_context extension *
**************************************/
#define CL_CONTEXT_TERMINATED_KHR -1121
#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031
#define CL_CONTEXT_TERMINATE_KHR 0x2032
#define cl_khr_terminate_context 1
extern CL_API_ENTRY cl_int CL_API_CALL
clTerminateContextKHR(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
/*
* Extension: cl_khr_spir
*
* This extension adds support to create an OpenCL program object from a
* Standard Portable Intermediate Representation (SPIR) instance
*/
#define CL_DEVICE_SPIR_VERSIONS 0x40E0
#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1
/*****************************************
* cl_khr_create_command_queue extension *
*****************************************/
#define cl_khr_create_command_queue 1
typedef cl_bitfield cl_queue_properties_khr;
extern CL_API_ENTRY cl_command_queue CL_API_CALL
clCreateCommandQueueWithPropertiesKHR(cl_context context,
cl_device_id device,
const cl_queue_properties_khr* properties,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_command_queue
(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context,
cl_device_id device,
const cl_queue_properties_khr* properties,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
/******************************************
* cl_nv_device_attribute_query extension *
******************************************/
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
/*********************************
* cl_amd_device_attribute_query *
*********************************/
#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
#define CL_DEVICE_TOPOLOGY_AMD 0x4037
#define CL_DEVICE_BOARD_NAME_AMD 0x4038
#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039
#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040
#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047
#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048
#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD 0x4049
#define CL_DEVICE_GFXIP_MAJOR_AMD 0x404A
#define CL_DEVICE_GFXIP_MINOR_AMD 0x404B
#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD 0x404C
#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030
#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031
#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD 0x4033
#define CL_DEVICE_PCIE_ID_AMD 0x4034
/*********************************
* cl_arm_printf extension
*********************************/
#define CL_PRINTF_CALLBACK_ARM 0x40B0
#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1
/***********************************
* cl_ext_device_fission extension
***********************************/
#define cl_ext_device_fission 1
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
typedef cl_ulong cl_device_partition_property_ext;
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateSubDevicesEXT(cl_device_id in_device,
const cl_device_partition_property_ext * properties,
cl_uint num_entries,
cl_device_id * out_devices,
cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id in_device,
const cl_device_partition_property_ext * properties,
cl_uint num_entries,
cl_device_id * out_devices,
cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1;
/* cl_device_partition_property_ext */
#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
/* clDeviceGetInfo selectors */
#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
/* error codes */
#define CL_DEVICE_PARTITION_FAILED_EXT -1057
#define CL_INVALID_PARTITION_COUNT_EXT -1058
#define CL_INVALID_PARTITION_NAME_EXT -1059
/* CL_AFFINITY_DOMAINs */
#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
/* cl_device_partition_property_ext list terminators */
#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
/***********************************
* cl_ext_migrate_memobject extension definitions
***********************************/
#define cl_ext_migrate_memobject 1
typedef cl_bitfield cl_mem_migration_flags_ext;
#define CL_MIGRATE_MEM_OBJECT_HOST_EXT 0x1
#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMigrateMemObjectEXT(cl_command_queue command_queue,
cl_uint num_mem_objects,
const cl_mem * mem_objects,
cl_mem_migration_flags_ext flags,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue,
cl_uint num_mem_objects,
const cl_mem * mem_objects,
cl_mem_migration_flags_ext flags,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
/*********************************
* cl_qcom_ext_host_ptr extension
*********************************/
#define cl_qcom_ext_host_ptr 1
#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29)
#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0
#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1
#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2
#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3
#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4
#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5
#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6
#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7
typedef cl_uint cl_image_pitch_info_qcom;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceImageInfoQCOM(cl_device_id device,
size_t image_width,
size_t image_height,
const cl_image_format *image_format,
cl_image_pitch_info_qcom param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
/* Base descriptor for the cl_qcom_ext_host_ptr extension. Layered
 * extensions (e.g. cl_qcom_ion_host_ptr, the Android native buffer
 * extension below) embed this struct as their first member so the
 * implementation can identify which kind of external allocation is
 * being imported. */
typedef struct _cl_mem_ext_host_ptr
{
/* Type of external memory allocation. */
/* Legal values will be defined in layered extensions
 * (e.g. CL_MEM_ION_HOST_PTR_QCOM, CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM). */
cl_uint allocation_type;
/* Host cache policy for this external memory allocation
 * (e.g. CL_MEM_HOST_UNCACHED_QCOM, CL_MEM_HOST_WRITEBACK_QCOM). */
cl_uint host_cache_policy;
} cl_mem_ext_host_ptr;
/*******************************************
* cl_qcom_ext_host_ptr_iocoherent extension
********************************************/
/* Cache policy specifying io-coherence */
#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9
/*********************************
* cl_qcom_ion_host_ptr extension
*********************************/
#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8
/* Descriptor for importing an ION allocation via the
 * cl_qcom_ion_host_ptr extension. */
typedef struct _cl_mem_ion_host_ptr
{
/* Type of external memory allocation. */
/* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
cl_mem_ext_host_ptr ext_host_ptr;
/* ION file descriptor referring to the allocation */
int ion_filedesc;
/* Host pointer to the ION allocated memory */
void* ion_hostptr;
} cl_mem_ion_host_ptr;
/*********************************
* cl_qcom_android_native_buffer_host_ptr extension
*********************************/
#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6
/* Descriptor for importing an Android native buffer via the
 * cl_qcom_android_native_buffer_host_ptr extension. */
typedef struct _cl_mem_android_native_buffer_host_ptr
{
/* Type of external memory allocation. */
/* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. */
cl_mem_ext_host_ptr ext_host_ptr;
/* Virtual pointer to the android native buffer */
void* anb_ptr;
} cl_mem_android_native_buffer_host_ptr;
/******************************************
* cl_img_yuv_image extension *
******************************************/
/* Image formats used in clCreateImage */
#define CL_NV21_IMG 0x40D0
#define CL_YV12_IMG 0x40D1
/******************************************
* cl_img_cached_allocations extension *
******************************************/
/* Flag values used by clCreateBuffer */
#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26)
#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27)
/******************************************
* cl_img_use_gralloc_ptr extension *
******************************************/
#define cl_img_use_gralloc_ptr 1
/* Flag values used by clCreateBuffer */
#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28)
/* To be used by clGetEventInfo: */
#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2
#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3
/* Error code from clEnqueueReleaseGrallocObjectsIMG */
#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGrallocObjectsIMG(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGrallocObjectsIMG(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
/*********************************
* cl_khr_subgroups extension
*********************************/
#define cl_khr_subgroups 1
#if !defined(CL_VERSION_2_1)
/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h.
In hindsight, there should have been a khr suffix on this type for
the extension, but keeping it un-suffixed to maintain backwards
compatibility. */
typedef cl_uint cl_kernel_sub_group_info;
#endif
/* cl_kernel_sub_group_info */
#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033
#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelSubGroupInfoKHR(cl_kernel in_kernel,
cl_device_id in_device,
cl_kernel_sub_group_info param_name,
size_t input_value_size,
const void * input_value,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
typedef CL_API_ENTRY cl_int
(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel in_kernel,
cl_device_id in_device,
cl_kernel_sub_group_info param_name,
size_t input_value_size,
const void * input_value,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
/*********************************
* cl_khr_mipmap_image extension
*********************************/
/* cl_sampler_properties */
#define CL_SAMPLER_MIP_FILTER_MODE_KHR 0x1155
#define CL_SAMPLER_LOD_MIN_KHR 0x1156
#define CL_SAMPLER_LOD_MAX_KHR 0x1157
/*********************************
* cl_khr_priority_hints extension
*********************************/
/* This extension define is for backwards compatibility.
It shouldn't be required since this extension has no new functions. */
#define cl_khr_priority_hints 1
typedef cl_uint cl_queue_priority_khr;
/* cl_command_queue_properties */
#define CL_QUEUE_PRIORITY_KHR 0x1096
/* cl_queue_priority_khr */
#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)
#define CL_QUEUE_PRIORITY_MED_KHR (1<<1)
#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)
/*********************************
* cl_khr_throttle_hints extension
*********************************/
/* This extension define is for backwards compatibility.
It shouldn't be required since this extension has no new functions. */
#define cl_khr_throttle_hints 1
typedef cl_uint cl_queue_throttle_khr;
/* cl_command_queue_properties */
#define CL_QUEUE_THROTTLE_KHR 0x1097
/* cl_queue_throttle_khr */
#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)
#define CL_QUEUE_THROTTLE_MED_KHR (1<<1)
#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)
/*********************************
* cl_khr_subgroup_named_barrier
*********************************/
/* This extension define is for backwards compatibility.
It shouldn't be required since this extension has no new functions. */
#define cl_khr_subgroup_named_barrier 1
/* cl_device_info */
#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035
/*********************************
* cl_khr_extended_versioning
*********************************/
#define cl_khr_extended_versioning 1
#define CL_VERSION_MAJOR_BITS_KHR (10)
#define CL_VERSION_MINOR_BITS_KHR (10)
#define CL_VERSION_PATCH_BITS_KHR (12)
#define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1)
#define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1)
#define CL_VERSION_PATCH_MASK_KHR ((1 << CL_VERSION_PATCH_BITS_KHR) - 1)
#define CL_VERSION_MAJOR_KHR(version) ((version) >> (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR))
#define CL_VERSION_MINOR_KHR(version) (((version) >> CL_VERSION_PATCH_BITS_KHR) & CL_VERSION_MINOR_MASK_KHR)
#define CL_VERSION_PATCH_KHR(version) ((version) & CL_VERSION_PATCH_MASK_KHR)
#define CL_MAKE_VERSION_KHR(major, minor, patch) \
((((major) & CL_VERSION_MAJOR_MASK_KHR) << (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) | \
(((minor) & CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \
((patch) & CL_VERSION_PATCH_MASK_KHR))
typedef cl_uint cl_version_khr;
#define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64
/* Pairs a packed numeric version (see CL_MAKE_VERSION_KHR /
 * CL_VERSION_*_KHR above) with a fixed-size name buffer; returned by the
 * *_WITH_VERSION_KHR queries defined below. */
typedef struct _cl_name_version_khr
{
cl_version_khr version; /* packed major/minor/patch version */
char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR]; /* e.g. extension or IL name */
} cl_name_version_khr;
/* cl_platform_info */
#define CL_PLATFORM_NUMERIC_VERSION_KHR 0x0906
#define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR 0x0907
/* cl_device_info */
#define CL_DEVICE_NUMERIC_VERSION_KHR 0x105E
#define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR 0x105F
#define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR 0x1060
#define CL_DEVICE_ILS_WITH_VERSION_KHR 0x1061
#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR 0x1062
/*********************************
* cl_khr_device_uuid extension
*********************************/
#define cl_khr_device_uuid 1
#define CL_UUID_SIZE_KHR 16
#define CL_LUID_SIZE_KHR 8
#define CL_DEVICE_UUID_KHR 0x106A
#define CL_DRIVER_UUID_KHR 0x106B
#define CL_DEVICE_LUID_VALID_KHR 0x106C
#define CL_DEVICE_LUID_KHR 0x106D
#define CL_DEVICE_NODE_MASK_KHR 0x106E
/**********************************
* cl_arm_import_memory extension *
**********************************/
#define cl_arm_import_memory 1
typedef intptr_t cl_import_properties_arm;
/* Default and valid property names for cl_arm_import_memory */
#define CL_IMPORT_TYPE_ARM 0x40B2
/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_HOST_ARM 0x40B3
/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4
/* Protected memory property */
#define CL_IMPORT_TYPE_PROTECTED_ARM 0x40B5
/* Android hardware buffer type value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2
/* Data consistency with host property */
#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3
/* Import memory size value to indicate a size for the whole buffer */
#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX
/* This extension adds a new function that allows for direct memory import into
* OpenCL via the clImportMemoryARM function.
*
* Memory imported through this interface will be mapped into the device's page
* tables directly, providing zero copy access. It will never fall back to copy
* operations and aliased buffers.
*
* Types of memory supported for import are specified as additional extension
* strings.
*
* This extension produces cl_mem allocations which are compatible with all other
* users of cl_mem in the standard API.
*
* This extension maps pages with the same properties as the normal buffer creation
* function clCreateBuffer.
*/
extern CL_API_ENTRY cl_mem CL_API_CALL
clImportMemoryARM( cl_context context,
cl_mem_flags flags,
const cl_import_properties_arm *properties,
void *memory,
size_t size,
cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
/******************************************
* cl_arm_shared_virtual_memory extension *
******************************************/
#define cl_arm_shared_virtual_memory 1
/* Used by clGetDeviceInfo */
#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6
/* Used by clGetMemObjectInfo */
#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7
/* Used by clSetKernelExecInfoARM: */
#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8
#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9
/* To be used by clGetEventInfo: */
#define CL_COMMAND_SVM_FREE_ARM 0x40BA
#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB
#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC
#define CL_COMMAND_SVM_MAP_ARM 0x40BD
#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE
/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0)
#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1)
#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2)
#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3)
/* Flag values used by clSVMAllocARM: */
#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10)
#define CL_MEM_SVM_ATOMICS_ARM (1 << 11)
typedef cl_bitfield cl_svm_mem_flags_arm;
typedef cl_uint cl_kernel_exec_info_arm;
typedef cl_bitfield cl_device_svm_capabilities_arm;
extern CL_API_ENTRY void * CL_API_CALL
clSVMAllocARM(cl_context context,
cl_svm_mem_flags_arm flags,
size_t size,
cl_uint alignment) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY void CL_API_CALL
clSVMFreeARM(cl_context context,
void * svm_pointer) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMFreeARM(cl_command_queue command_queue,
cl_uint num_svm_pointers,
void * svm_pointers[],
void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue,
cl_uint num_svm_pointers,
void * svm_pointers[],
void * user_data),
void * user_data,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemcpyARM(cl_command_queue command_queue,
cl_bool blocking_copy,
void * dst_ptr,
const void * src_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemFillARM(cl_command_queue command_queue,
void * svm_ptr,
const void * pattern,
size_t pattern_size,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMapARM(cl_command_queue command_queue,
cl_bool blocking_map,
cl_map_flags flags,
void * svm_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMUnmapARM(cl_command_queue command_queue,
void * svm_ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArgSVMPointerARM(cl_kernel kernel,
cl_uint arg_index,
const void * arg_value) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelExecInfoARM(cl_kernel kernel,
cl_kernel_exec_info_arm param_name,
size_t param_value_size,
const void * param_value) CL_EXT_SUFFIX__VERSION_1_2;
/********************************
* cl_arm_get_core_id extension *
********************************/
#ifdef CL_VERSION_1_2
#define cl_arm_get_core_id 1
/* Device info property for bitfield of cores present */
#define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM 0x40BF
#endif /* CL_VERSION_1_2 */
/*********************************
* cl_arm_job_slot_selection
*********************************/
#define cl_arm_job_slot_selection 1
/* cl_device_info */
#define CL_DEVICE_JOB_SLOTS_ARM 0x41E0
/* cl_command_queue_properties */
#define CL_QUEUE_JOB_SLOT_ARM 0x41E1
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_ext_intel.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2020 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_ext_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __CL_EXT_INTEL_H
#define __CL_EXT_INTEL_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/***************************************
* cl_intel_thread_local_exec extension *
****************************************/
#define cl_intel_thread_local_exec 1
#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31)
/***********************************************
* cl_intel_device_partition_by_names extension *
************************************************/
#define cl_intel_device_partition_by_names 1
#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052
#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1
/************************************************
* cl_intel_accelerator extension *
* cl_intel_motion_estimation extension *
* cl_intel_advanced_motion_estimation extension *
*************************************************/
#define cl_intel_accelerator 1
#define cl_intel_motion_estimation 1
#define cl_intel_advanced_motion_estimation 1
typedef struct _cl_accelerator_intel* cl_accelerator_intel;
typedef cl_uint cl_accelerator_type_intel;
typedef cl_uint cl_accelerator_info_intel;
/* Configuration descriptor for a motion estimation accelerator
 * (cl_intel_motion_estimation); field names correspond to the CL_ME_*
 * define groups below. Presumably passed as the `descriptor` argument of
 * clCreateAcceleratorINTEL — confirm against the extension spec. */
typedef struct _cl_motion_estimation_desc_intel {
cl_uint mb_block_type; /* CL_ME_MB_TYPE_* */
cl_uint subpixel_mode; /* CL_ME_SUBPIXEL_MODE_* */
cl_uint sad_adjust_mode; /* CL_ME_SAD_ADJUST_MODE_* */
cl_uint search_path_type; /* CL_ME_SEARCH_PATH_RADIUS_* */
} cl_motion_estimation_desc_intel;
/* error codes */
#define CL_INVALID_ACCELERATOR_INTEL -1094
#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095
#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
/* cl_accelerator_type_intel */
#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0
/* cl_accelerator_info_intel */
#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090
#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091
#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092
#define CL_ACCELERATOR_TYPE_INTEL 0x4093
/* cl_motion_detect_desc_intel flags */
#define CL_ME_MB_TYPE_16x16_INTEL 0x0
#define CL_ME_MB_TYPE_8x8_INTEL 0x1
#define CL_ME_MB_TYPE_4x4_INTEL 0x2
#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2
#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1
#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0
#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1
#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5
#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0
#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1
#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2
#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4
#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1
#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2
#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3
#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16
#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21
#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32
#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43
#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48
#define CL_ME_COST_PENALTY_NONE_INTEL 0x0
#define CL_ME_COST_PENALTY_LOW_INTEL 0x1
#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2
#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3
#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CL_ME_COST_PRECISION_PEL_INTEL 0x2
#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
/* cl_device_info */
#define CL_DEVICE_ME_VERSION_INTEL 0x407E
#define CL_ME_VERSION_LEGACY_INTEL 0x0
#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1
#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2
extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
clCreateAcceleratorINTEL(
cl_context context,
cl_accelerator_type_intel accelerator_type,
size_t descriptor_size,
const void* descriptor,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
cl_context context,
cl_accelerator_type_intel accelerator_type,
size_t descriptor_size,
const void* descriptor,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetAcceleratorInfoINTEL(
cl_accelerator_intel accelerator,
cl_accelerator_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
cl_accelerator_intel accelerator,
cl_accelerator_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainAcceleratorINTEL(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseAcceleratorINTEL(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
/******************************************
* cl_intel_simultaneous_sharing extension *
*******************************************/
#define cl_intel_simultaneous_sharing 1
#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104
#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105
/***********************************
* cl_intel_egl_image_yuv extension *
************************************/
#define cl_intel_egl_image_yuv 1
#define CL_EGL_YUV_PLANE_INTEL 0x4107
/********************************
* cl_intel_packed_yuv extension *
*********************************/
#define cl_intel_packed_yuv 1
#define CL_YUYV_INTEL 0x4076
#define CL_UYVY_INTEL 0x4077
#define CL_YVYU_INTEL 0x4078
#define CL_VYUY_INTEL 0x4079
/********************************************
* cl_intel_required_subgroup_size extension *
*********************************************/
#define cl_intel_required_subgroup_size 1
#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109
#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A
/****************************************
* cl_intel_driver_diagnostics extension *
*****************************************/
#define cl_intel_driver_diagnostics 1
typedef cl_uint cl_diagnostics_verbose_level;
#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 )
/********************************
* cl_intel_planar_yuv extension *
*********************************/
#define CL_NV12_INTEL 0x410E
#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 )
#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 )
#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E
#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F
/*******************************************************
* cl_intel_device_side_avc_motion_estimation extension *
********************************************************/
#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B
#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C
#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D
#define CL_AVC_ME_VERSION_0_INTEL 0x0 /* No support. */
#define CL_AVC_ME_VERSION_1_INTEL 0x1 /* First supported version. */
#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0
#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1
#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2
#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3
#define CL_AVC_ME_MINOR_8x8_INTEL 0x0
#define CL_AVC_ME_MINOR_8x4_INTEL 0x1
#define CL_AVC_ME_MINOR_4x8_INTEL 0x2
#define CL_AVC_ME_MINOR_4x4_INTEL 0x3
#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0
#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9
#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2
#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa
#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2
#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3
#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3
#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30
#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000
#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 )
#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80
#define CL_AVC_ME_INTRA_16x16_INTEL 0x0
#define CL_AVC_ME_INTRA_8x8_INTEL 0x1
#define CL_AVC_ME_INTRA_4x4_INTEL 0x2
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3
#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1
#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2
#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3
#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2
#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
/*******************************************
* cl_intel_unified_shared_memory extension *
********************************************/
/* These APIs are in sync with Revision O of the cl_intel_unified_shared_memory spec! */
#define cl_intel_unified_shared_memory 1
/* cl_device_info */
#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL 0x4190
#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL 0x4191
#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192
#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193
#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL 0x4194
typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel;
/* cl_device_unified_shared_memory_capabilities_intel - bitfield */
#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL (1 << 0)
#define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL (1 << 1)
#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL (1 << 2)
#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3)
typedef cl_bitfield cl_mem_properties_intel;
/* cl_mem_properties_intel */
#define CL_MEM_ALLOC_FLAGS_INTEL 0x4195
typedef cl_bitfield cl_mem_alloc_flags_intel;
/* cl_mem_alloc_flags_intel - bitfield */
#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL (1 << 0)
typedef cl_uint cl_mem_info_intel;
/* cl_mem_alloc_info_intel */
#define CL_MEM_ALLOC_TYPE_INTEL 0x419A
#define CL_MEM_ALLOC_BASE_PTR_INTEL 0x419B
#define CL_MEM_ALLOC_SIZE_INTEL 0x419C
#define CL_MEM_ALLOC_DEVICE_INTEL 0x419D
/* Enum values 0x419E-0x419F are reserved for future queries. */
typedef cl_uint cl_unified_shared_memory_type_intel;
/* cl_unified_shared_memory_type_intel */
#define CL_MEM_TYPE_UNKNOWN_INTEL 0x4196
#define CL_MEM_TYPE_HOST_INTEL 0x4197
#define CL_MEM_TYPE_DEVICE_INTEL 0x4198
#define CL_MEM_TYPE_SHARED_INTEL 0x4199
typedef cl_uint cl_mem_advice_intel;
/* cl_mem_advice_intel */
/* Enum values 0x4208-0x420F are reserved for future memory advices. */
/* cl_kernel_exec_info */
#define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL 0x4200
#define CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL 0x4201
#define CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL 0x4202
#define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL 0x4203
/* cl_command_type */
#define CL_COMMAND_MEMFILL_INTEL 0x4204
#define CL_COMMAND_MEMCPY_INTEL 0x4205
#define CL_COMMAND_MIGRATEMEM_INTEL 0x4206
#define CL_COMMAND_MEMADVISE_INTEL 0x4207
extern CL_API_ENTRY void* CL_API_CALL
clHostMemAllocINTEL(
cl_context context,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
typedef CL_API_ENTRY void* (CL_API_CALL *
clHostMemAllocINTEL_fn)(
cl_context context,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
extern CL_API_ENTRY void* CL_API_CALL
clDeviceMemAllocINTEL(
cl_context context,
cl_device_id device,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
typedef CL_API_ENTRY void* (CL_API_CALL *
clDeviceMemAllocINTEL_fn)(
cl_context context,
cl_device_id device,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
extern CL_API_ENTRY void* CL_API_CALL
clSharedMemAllocINTEL(
cl_context context,
cl_device_id device,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
typedef CL_API_ENTRY void* (CL_API_CALL *
clSharedMemAllocINTEL_fn)(
cl_context context,
cl_device_id device,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL
clMemFreeINTEL(
cl_context context,
void* ptr);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clMemFreeINTEL_fn)(
cl_context context,
void* ptr);
extern CL_API_ENTRY cl_int CL_API_CALL
clGetMemAllocInfoINTEL(
cl_context context,
const void* ptr,
cl_mem_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clGetMemAllocInfoINTEL_fn)(
cl_context context,
const void* ptr,
cl_mem_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret);
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArgMemPointerINTEL(
cl_kernel kernel,
cl_uint arg_index,
const void* arg_value);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clSetKernelArgMemPointerINTEL_fn)(
cl_kernel kernel,
cl_uint arg_index,
const void* arg_value);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMemsetINTEL( /* Deprecated */
cl_command_queue command_queue,
void* dst_ptr,
cl_int value,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMemsetINTEL_fn)( /* Deprecated */
cl_command_queue command_queue,
void* dst_ptr,
cl_int value,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMemFillINTEL(
cl_command_queue command_queue,
void* dst_ptr,
const void* pattern,
size_t pattern_size,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMemFillINTEL_fn)(
cl_command_queue command_queue,
void* dst_ptr,
const void* pattern,
size_t pattern_size,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMemcpyINTEL(
cl_command_queue command_queue,
cl_bool blocking,
void* dst_ptr,
const void* src_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMemcpyINTEL_fn)(
cl_command_queue command_queue,
cl_bool blocking,
void* dst_ptr,
const void* src_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
#ifdef CL_VERSION_1_2
/* Because these APIs use cl_mem_migration_flags, they require
OpenCL 1.2: */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMigrateMemINTEL(
cl_command_queue command_queue,
const void* ptr,
size_t size,
cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMigrateMemINTEL_fn)(
cl_command_queue command_queue,
const void* ptr,
size_t size,
cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMemAdviseINTEL(
cl_command_queue command_queue,
const void* ptr,
size_t size,
cl_mem_advice_intel advice,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMemAdviseINTEL_fn)(
cl_command_queue command_queue,
const void* ptr,
size_t size,
cl_mem_advice_intel advice,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_INTEL_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_gl.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_GL_H
#define __OPENCL_CL_GL_H
#include <CL/cl.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef cl_uint cl_gl_object_type;
typedef cl_uint cl_gl_texture_info;
typedef cl_uint cl_gl_platform_info;
typedef struct __GLsync *cl_GLsync;
/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
#define CL_GL_OBJECT_BUFFER 0x2000
#define CL_GL_OBJECT_TEXTURE2D 0x2001
#define CL_GL_OBJECT_TEXTURE3D 0x2002
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
#ifdef CL_VERSION_1_2
#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
#define CL_GL_OBJECT_TEXTURE1D 0x200F
#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
#endif
/* cl_gl_texture_info */
#define CL_GL_TEXTURE_TARGET 0x2004
#define CL_GL_MIPMAP_LEVEL 0x2005
#ifdef CL_VERSION_1_2
#define CL_GL_NUM_SAMPLES 0x2012
#endif
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLBuffer(cl_context context,
cl_mem_flags flags,
cl_GLuint bufobj,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#endif
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLRenderbuffer(cl_context context,
cl_mem_flags flags,
cl_GLuint renderbuffer,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLObjectInfo(cl_mem memobj,
cl_gl_object_type * gl_object_type,
cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLTextureInfo(cl_mem memobj,
cl_gl_texture_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
/* Deprecated OpenCL 1.1 APIs */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture2D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture3D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
/* cl_khr_gl_sharing extension */
#define cl_khr_gl_sharing 1
typedef cl_uint cl_gl_context_info;
/* Additional Error Codes */
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
/* cl_gl_context_info */
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
/* Additional cl_context_properties */
#define CL_GL_CONTEXT_KHR 0x2008
#define CL_EGL_DISPLAY_KHR 0x2009
#define CL_GLX_DISPLAY_KHR 0x200A
#define CL_WGL_HDC_KHR 0x200B
#define CL_CGL_SHAREGROUP_KHR 0x200C
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLContextInfoKHR(const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret);
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_gl_ext.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_GL_EXT_H
#define __OPENCL_CL_GL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl_gl.h>
/*
* cl_khr_gl_event extension
*/
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromGLsyncKHR(cl_context context,
cl_GLsync cl_GLsync,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_EXT_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_half.h
================================================
/*******************************************************************************
* Copyright (c) 2019-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/**
* This is a header-only utility library that provides OpenCL host code with
* routines for converting to/from cl_half values.
*
* Example usage:
*
* #include <CL/cl_half.h>
* ...
* cl_half h = cl_half_from_float(0.5f, CL_HALF_RTE);
* cl_float f = cl_half_to_float(h);
*/
#ifndef OPENCL_CL_HALF_H
#define OPENCL_CL_HALF_H
#include <CL/cl_platform.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* Rounding mode used when converting to cl_half.
*/
typedef enum
{
CL_HALF_RTE, // round to nearest even
CL_HALF_RTZ, // round towards zero
CL_HALF_RTP, // round towards positive infinity
CL_HALF_RTN, // round towards negative infinity
} cl_half_rounding_mode;
/* Private utility macros. */
#define CL_HALF_EXP_MASK 0x7C00
#define CL_HALF_MAX_FINITE_MAG 0x7BFF
/*
 * Utility to deal with values that overflow when converting to half precision.
 * Directed rounding modes must saturate instead of producing an infinity in
 * the "wrong" direction; every other case overflows to a signed infinity.
 */
static inline cl_half cl_half_handle_overflow(cl_half_rounding_mode rounding_mode,
uint16_t sign)
{
switch (rounding_mode)
{
case CL_HALF_RTZ:
// Truncating: saturate to the largest finite magnitude, keeping the sign
return (sign << 15) | CL_HALF_MAX_FINITE_MAG;
case CL_HALF_RTP:
// Rounding up: a negative overflow may not reach -infinity, so it
// saturates to the most negative finite value instead
if (sign)
return (1 << 15) | CL_HALF_MAX_FINITE_MAG;
break;
case CL_HALF_RTN:
// Rounding down: a positive overflow may not reach +infinity, so it
// saturates to the largest finite value instead
if (!sign)
return CL_HALF_MAX_FINITE_MAG;
break;
default:
break;
}
// Overflow to infinity (sign bit | all exponent bits set, zero mantissa)
return (sign << 15) | CL_HALF_EXP_MASK;
}
/*
 * Utility to deal with values that underflow when converting to half precision.
 * Directed rounding away from zero yields the smallest-magnitude denormal;
 * every other mode flushes to a (signed) zero.
 */
static inline cl_half cl_half_handle_underflow(cl_half_rounding_mode rounding_mode,
uint16_t sign)
{
// True when the mode rounds this sign away from zero:
// RTP on a positive value, or RTN on a negative one
const int rounds_away = (rounding_mode == CL_HALF_RTP && !sign) ||
                        (rounding_mode == CL_HALF_RTN && sign);
if (rounds_away)
{
// Smallest representable magnitude with the original sign
return (sign << 15) | 1;
}
// Flush to zero, preserving the sign
return (sign << 15);
}
/**
 * Convert a cl_float to a cl_half.
 *
 * f:             the single-precision value to convert.
 * rounding_mode: rounding behaviour for values with no exact
 *                half-precision representation.
 *
 * Special cases (NaN, infinity, zero, overflow, underflow, denormal
 * results) are dispatched explicitly before the common rounding path.
 */
static inline cl_half cl_half_from_float(cl_float f, cl_half_rounding_mode rounding_mode)
{
// Type-punning to get direct access to underlying bits
union
{
cl_float f;
uint32_t i;
} f32;
f32.f = f;
// Extract sign bit
uint16_t sign = f32.i >> 31;
// Extract FP32 exponent and mantissa
uint32_t f_exp = (f32.i >> (CL_FLT_MANT_DIG - 1)) & 0xFF;
uint32_t f_mant = f32.i & ((1 << (CL_FLT_MANT_DIG - 1)) - 1);
// Remove FP32 exponent bias
int32_t exp = f_exp - CL_FLT_MAX_EXP + 1;
// Add FP16 exponent bias
uint16_t h_exp = exp + CL_HALF_MAX_EXP - 1;
// Position of the bit that will become the FP16 mantissa LSB
uint32_t lsb_pos = CL_FLT_MANT_DIG - CL_HALF_MANT_DIG;
// Check for NaN / infinity (all FP32 exponent bits set)
if (f_exp == 0xFF)
{
if (f_mant)
{
// NaN -> propagate mantissa and silence it
// (0x200 is the most-significant bit of the 10-bit FP16 mantissa)
uint16_t h_mant = f_mant >> lsb_pos;
h_mant |= 0x200;
return (sign << 15) | CL_HALF_EXP_MASK | h_mant;
}
else
{
// Infinity -> zero mantissa
return (sign << 15) | CL_HALF_EXP_MASK;
}
}
// Check for zero (sign is kept, so -0.0f maps to negative zero)
if (!f_exp && !f_mant)
{
return (sign << 15);
}
// Check for overflow: exponent too large for any finite FP16 value
if (exp >= CL_HALF_MAX_EXP)
{
return cl_half_handle_overflow(rounding_mode, sign);
}
// Check for underflow: magnitude below the smallest FP16 denormal
if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))
{
return cl_half_handle_underflow(rounding_mode, sign);
}
// Check for value that will become denormal (below the smallest normal
// FP16 exponent, but still representable as an FP16 denormal)
if (exp < -14)
{
// Denormal -> include the implicit 1 from the FP32 mantissa
h_exp = 0;
f_mant |= 1 << (CL_FLT_MANT_DIG - 1);
// Mantissa shift amount depends on exponent
lsb_pos = -exp + (CL_FLT_MANT_DIG - 25);
}
// Generate FP16 mantissa by shifting FP32 mantissa
uint16_t h_mant = f_mant >> lsb_pos;
// Check whether we need to round: `halfway` is the discarded-bits value
// exactly between two representable FP16 results, `mask` selects all
// bits that will be discarded
uint32_t halfway = 1 << (lsb_pos - 1);
uint32_t mask = (halfway << 1) - 1;
switch (rounding_mode)
{
case CL_HALF_RTE:
if ((f_mant & mask) > halfway)
{
// More than halfway -> round up
h_mant += 1;
}
else if ((f_mant & mask) == halfway)
{
// Exactly halfway -> round to nearest even
if (h_mant & 0x1)
h_mant += 1;
}
break;
case CL_HALF_RTZ:
// Mantissa has already been truncated -> do nothing
break;
case CL_HALF_RTP:
if ((f_mant & mask) && !sign)
{
// Round positive numbers up
h_mant += 1;
}
break;
case CL_HALF_RTN:
if ((f_mant & mask) && sign)
{
// Round negative numbers down
h_mant += 1;
}
break;
}
// Check for mantissa overflow: rounding carried out of the 10 mantissa
// bits, so bump the exponent (this also promotes a rounded-up denormal
// to the smallest normal value)
if (h_mant & 0x400)
{
h_exp += 1;
h_mant = 0;
}
return (sign << 15) | (h_exp << 10) | h_mant;
}
/**
 * Convert a cl_double to a cl_half.
 *
 * d:             the double-precision value to convert.
 * rounding_mode: rounding behaviour for values with no exact
 *                half-precision representation.
 *
 * Mirrors cl_half_from_float, but extracts fields from the 64-bit
 * representation (11 exponent bits, 52 stored mantissa bits).
 */
static inline cl_half cl_half_from_double(cl_double d, cl_half_rounding_mode rounding_mode)
{
// Type-punning to get direct access to underlying bits
union
{
cl_double d;
uint64_t i;
} f64;
f64.d = d;
// Extract sign bit
uint16_t sign = f64.i >> 63;
// Extract FP64 exponent and mantissa
uint64_t d_exp = (f64.i >> (CL_DBL_MANT_DIG - 1)) & 0x7FF;
uint64_t d_mant = f64.i & (((uint64_t)1 << (CL_DBL_MANT_DIG - 1)) - 1);
// Remove FP64 exponent bias
int64_t exp = d_exp - CL_DBL_MAX_EXP + 1;
// Add FP16 exponent bias
uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1);
// Position of the bit that will become the FP16 mantissa LSB
uint32_t lsb_pos = CL_DBL_MANT_DIG - CL_HALF_MANT_DIG;
// Check for NaN / infinity (all FP64 exponent bits set)
if (d_exp == 0x7FF)
{
if (d_mant)
{
// NaN -> propagate mantissa and silence it
// (0x200 is the most-significant bit of the 10-bit FP16 mantissa)
uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos);
h_mant |= 0x200;
return (sign << 15) | CL_HALF_EXP_MASK | h_mant;
}
else
{
// Infinity -> zero mantissa
return (sign << 15) | CL_HALF_EXP_MASK;
}
}
// Check for zero (sign is kept, so -0.0 maps to negative zero)
if (!d_exp && !d_mant)
{
return (sign << 15);
}
// Check for overflow: exponent too large for any finite FP16 value
if (exp >= CL_HALF_MAX_EXP)
{
return cl_half_handle_overflow(rounding_mode, sign);
}
// Check for underflow: magnitude below the smallest FP16 denormal
if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))
{
return cl_half_handle_underflow(rounding_mode, sign);
}
// Check for value that will become denormal (below the smallest normal
// FP16 exponent, but still representable as an FP16 denormal)
if (exp < -14)
{
// Include the implicit 1 from the FP64 mantissa
h_exp = 0;
d_mant |= (uint64_t)1 << (CL_DBL_MANT_DIG - 1);
// Mantissa shift amount depends on exponent
lsb_pos = (uint32_t)(-exp + (CL_DBL_MANT_DIG - 25));
}
// Generate FP16 mantissa by shifting FP64 mantissa
uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos);
// Check whether we need to round: `halfway` is the discarded-bits value
// exactly between two representable FP16 results, `mask` selects all
// bits that will be discarded
uint64_t halfway = (uint64_t)1 << (lsb_pos - 1);
uint64_t mask = (halfway << 1) - 1;
switch (rounding_mode)
{
case CL_HALF_RTE:
if ((d_mant & mask) > halfway)
{
// More than halfway -> round up
h_mant += 1;
}
else if ((d_mant & mask) == halfway)
{
// Exactly halfway -> round to nearest even
if (h_mant & 0x1)
h_mant += 1;
}
break;
case CL_HALF_RTZ:
// Mantissa has already been truncated -> do nothing
break;
case CL_HALF_RTP:
if ((d_mant & mask) && !sign)
{
// Round positive numbers up
h_mant += 1;
}
break;
case CL_HALF_RTN:
if ((d_mant & mask) && sign)
{
// Round negative numbers down
h_mant += 1;
}
break;
}
// Check for mantissa overflow: rounding carried out of the 10 mantissa
// bits, so bump the exponent (this also promotes a rounded-up denormal
// to the smallest normal value)
if (h_mant & 0x400)
{
h_exp += 1;
h_mant = 0;
}
return (sign << 15) | (h_exp << 10) | h_mant;
}
/**
 * Convert a cl_half to a cl_float.
 *
 * h: the half-precision value (its 16-bit pattern) to convert.
 *
 * Every FP16 value is exactly representable in FP32, so the conversion is
 * exact: NaN payloads are carried over and silenced, infinities and signed
 * zeros map directly, and FP16 denormals are normalized into FP32 normals.
 */
static inline cl_float cl_half_to_float(cl_half h)
{
// Type-punning to get direct access to underlying bits
union
{
cl_float f;
uint32_t i;
} f32;
// Extract sign bit. Widened to uint32_t so that `sign << 31` below is an
// unsigned shift; with the original uint16_t, integer promotion made it a
// signed-int shift into the sign bit, which is undefined behaviour in C.
uint32_t sign = h >> 15;
// Extract FP16 exponent and mantissa
uint16_t h_exp = (h >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
uint16_t h_mant = h & 0x3FF;
// Remove FP16 exponent bias
int32_t exp = h_exp - CL_HALF_MAX_EXP + 1;
// Add FP32 exponent bias
uint32_t f_exp = exp + CL_FLT_MAX_EXP - 1;
// Check for NaN / infinity (all FP16 exponent bits set)
if (h_exp == 0x1F)
{
if (h_mant)
{
// NaN -> propagate mantissa and silence it
// (0x400000 is the most-significant FP32 mantissa bit)
uint32_t f_mant = h_mant << (CL_FLT_MANT_DIG - CL_HALF_MANT_DIG);
f_mant |= 0x400000;
f32.i = (sign << 31) | 0x7F800000 | f_mant;
return f32.f;
}
else
{
// Infinity -> zero mantissa
f32.i = (sign << 31) | 0x7F800000;
return f32.f;
}
}
// Check for zero / denormal
if (h_exp == 0)
{
if (h_mant == 0)
{
// Zero -> zero exponent (sign is kept, so -0.0 stays negative)
f_exp = 0;
}
else
{
// Denormal -> normalize it
// - Shift mantissa to make most-significant 1 implicit
// - Adjust exponent accordingly
uint32_t shift = 0;
while ((h_mant & 0x400) == 0)
{
h_mant <<= 1;
shift++;
}
// Drop the now-implicit leading 1, then compensate the exponent
// for the normalization shift
h_mant &= 0x3FF;
f_exp -= shift - 1;
}
}
f32.i = (sign << 31) | (f_exp << 23) | (h_mant << 13);
return f32.f;
}
#undef CL_HALF_EXP_MASK
#undef CL_HALF_MAX_FINITE_MAG
#ifdef __cplusplus
}
#endif
#endif /* OPENCL_CL_HALF_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_icd.h
================================================
/*******************************************************************************
* Copyright (c) 2019-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef OPENCL_CL_ICD_H
#define OPENCL_CL_ICD_H
#include <CL/cl.h>
#include <CL/cl_egl.h>
#include <CL/cl_ext.h>
#include <CL/cl_gl.h>
#if defined(_WIN32)
#include <CL/cl_d3d11.h>
#include <CL/cl_d3d10.h>
#include <CL/cl_dx9_media_sharing.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/*
* This file contains pointer type definitions for each of the CL API calls as
* well as a type definition for the dispatch table used by the Khronos ICD
* loader (see cl_khr_icd extension specification for background).
*/
/* API function pointer definitions */
// Platform APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformIDs)(
cl_uint num_entries, cl_platform_id *platforms,
cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformInfo)(
cl_platform_id platform, cl_platform_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Device APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDs)(
cl_platform_id platform, cl_device_type device_type, cl_uint num_entries,
cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceInfo)(
cl_device_id device, cl_device_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevices)(
cl_device_id in_device,
const cl_device_partition_property *partition_properties,
cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices);
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDevice)(
cl_device_id device) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDevice)(
cl_device_id device) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clCreateSubDevices;
typedef void *cl_api_clRetainDevice;
typedef void *cl_api_clReleaseDevice;
#endif
// Context APIs
typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContext)(
const cl_context_properties *properties, cl_uint num_devices,
const cl_device_id *devices,
void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContextFromType)(
const cl_context_properties *properties, cl_device_type device_type,
void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainContext)(
cl_context context) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseContext)(
cl_context context) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetContextInfo)(
cl_context context, cl_context_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Command Queue APIs
typedef CL_API_ENTRY cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueue)(
cl_context context, cl_device_id device,
cl_command_queue_properties properties,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
typedef CL_API_ENTRY
cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueueWithProperties)(
cl_context /* context */, cl_device_id /* device */,
const cl_queue_properties * /* properties */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
#else
typedef void *cl_api_clCreateCommandQueueWithProperties;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainCommandQueue)(
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseCommandQueue)(
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetCommandQueueInfo)(
cl_command_queue command_queue, cl_command_queue_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Memory Object APIs
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBuffer)(
cl_context context, cl_mem_flags flags, size_t size, void *host_ptr,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage)(
cl_context context, cl_mem_flags flags, const cl_image_format *image_format,
const cl_image_desc *image_desc, void *host_ptr,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clCreateImage;
#endif
#ifdef CL_VERSION_3_0
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBufferWithProperties)(
cl_context context, const cl_mem_properties *properties, cl_mem_flags flags,
size_t size, void *host_ptr,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImageWithProperties)(
cl_context context, const cl_mem_properties *properties, cl_mem_flags flags,
const cl_image_format *image_format, const cl_image_desc *image_desc,
void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0;
#else
typedef void *cl_api_clCreateBufferWithProperties;
typedef void *cl_api_clCreateImageWithProperties;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainMemObject)(
cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseMemObject)(
cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSupportedImageFormats)(
cl_context context, cl_mem_flags flags, cl_mem_object_type image_type,
cl_uint num_entries, cl_image_format *image_formats,
cl_uint *num_image_formats) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetMemObjectInfo)(
cl_mem memobj, cl_mem_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetImageInfo)(
cl_mem image, cl_image_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
/* Pipe and shared-virtual-memory entry points, available from OpenCL 2.0
 * onward. Parameter names are kept in comments (upstream style) because
 * they are purely informational in a function-pointer typedef. */
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreatePipe)(
cl_context /* context */, cl_mem_flags /* flags */,
cl_uint /* pipe_packet_size */, cl_uint /* pipe_max_packets */,
const cl_pipe_properties * /* properties */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPipeInfo)(
cl_mem /* pipe */, cl_pipe_info /* param_name */,
size_t /* param_value_size */, void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clSVMAlloc)(
cl_context /* context */, cl_svm_mem_flags /* flags */, size_t /* size */,
unsigned int /* alignment */)CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY void(CL_API_CALL *cl_api_clSVMFree)(
cl_context /* context */,
void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0;
#else
/* Built without CL_VERSION_2_0: degrade each entry to an opaque pointer so
 * the ICD dispatch table keeps the same size and member offsets either way. */
typedef void *cl_api_clCreatePipe;
typedef void *cl_api_clGetPipeInfo;
typedef void *cl_api_clSVMAlloc;
typedef void *cl_api_clSVMFree;
#endif
// Sampler APIs
typedef CL_API_ENTRY cl_sampler(CL_API_CALL *cl_api_clCreateSampler)(
cl_context context, cl_bool normalized_coords,
cl_addressing_mode addressing_mode, cl_filter_mode filter_mode,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainSampler)(
cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseSampler)(
cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSamplerInfo)(
cl_sampler sampler, cl_sampler_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
typedef CL_API_ENTRY
cl_sampler(CL_API_CALL *cl_api_clCreateSamplerWithProperties)(
cl_context /* context */,
const cl_sampler_properties * /* sampler_properties */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
#else
typedef void *cl_api_clCreateSamplerWithProperties;
#endif
// Program Object APIs
typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithSource)(
cl_context context, cl_uint count, const char **strings,
const size_t *lengths, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithBinary)(
cl_context context, cl_uint num_devices, const cl_device_id *device_list,
const size_t *lengths, const unsigned char **binaries,
cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY
cl_program(CL_API_CALL *cl_api_clCreateProgramWithBuiltInKernels)(
cl_context context, cl_uint num_devices, const cl_device_id *device_list,
const char *kernel_names, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clCreateProgramWithBuiltInKernels;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainProgram)(
cl_program program) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseProgram)(
cl_program program) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clBuildProgram)(
cl_program program, cl_uint num_devices, const cl_device_id *device_list,
const char *options,
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCompileProgram)(
cl_program program, cl_uint num_devices, const cl_device_id *device_list,
const char *options, cl_uint num_input_headers,
const cl_program *input_headers, const char **header_include_names,
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clLinkProgram)(
cl_context context, cl_uint num_devices, const cl_device_id *device_list,
const char *options, cl_uint num_input_programs,
const cl_program *input_programs,
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clCompileProgram;
typedef void *cl_api_clLinkProgram;
#endif
#ifdef CL_VERSION_2_2
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clSetProgramSpecializationConstant)(
cl_program program, cl_uint spec_id, size_t spec_size,
const void *spec_value) CL_API_SUFFIX__VERSION_2_2;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetProgramReleaseCallback)(
cl_program program,
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data) CL_API_SUFFIX__VERSION_2_2;
#else
typedef void *cl_api_clSetProgramSpecializationConstant;
typedef void *cl_api_clSetProgramReleaseCallback;
#endif
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadPlatformCompiler)(
cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clUnloadPlatformCompiler;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramInfo)(
cl_program program, cl_program_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramBuildInfo)(
cl_program program, cl_device_id device, cl_program_build_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Kernel Object APIs
typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCreateKernel)(
cl_program program, const char *kernel_name,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateKernelsInProgram)(
cl_program program, cl_uint num_kernels, cl_kernel *kernels,
cl_uint *num_kernels_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainKernel)(
cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseKernel)(
cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArg)(
cl_kernel kernel, cl_uint arg_index, size_t arg_size,
const void *arg_value) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelInfo)(
cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelArgInfo)(
cl_kernel kernel, cl_uint arg_indx, cl_kernel_arg_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clGetKernelArgInfo;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelWorkGroupInfo)(
cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArgSVMPointer)(
cl_kernel /* kernel */, cl_uint /* arg_index */,
const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelExecInfo)(
cl_kernel /* kernel */, cl_kernel_exec_info /* param_name */,
size_t /* param_value_size */,
const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfoKHR)(
cl_kernel /* in_kernel */, cl_device_id /*in_device*/,
cl_kernel_sub_group_info /* param_name */, size_t /*input_value_size*/,
const void * /*input_value*/, size_t /*param_value_size*/,
void * /*param_value*/,
size_t * /*param_value_size_ret*/) CL_EXT_SUFFIX__VERSION_2_0;
#else
typedef void *cl_api_clSetKernelArgSVMPointer;
typedef void *cl_api_clSetKernelExecInfo;
typedef void *cl_api_clGetKernelSubGroupInfoKHR;
#endif
// Event Object APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clWaitForEvents)(
cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventInfo)(
cl_event event, cl_event_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainEvent)(cl_event event)
CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseEvent)(cl_event event)
CL_API_SUFFIX__VERSION_1_0;
// Profiling APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventProfilingInfo)(
cl_event event, cl_profiling_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Flush and Finish APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFlush)(
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFinish)(
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
// Enqueued Commands APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBuffer)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
size_t offset, size_t cb, void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBufferRect)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
const size_t *buffer_origin, const size_t *host_origin,
const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
size_t host_row_pitch, size_t host_slice_pitch, void *ptr,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_1;
#else
typedef void *cl_api_clEnqueueReadBufferRect;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBuffer)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBufferRect)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
const size_t *buffer_origin, const size_t *host_origin,
const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
size_t host_row_pitch, size_t host_slice_pitch, const void *ptr,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_1;
#else
typedef void *cl_api_clEnqueueWriteBufferRect;
#endif
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillBuffer)(
cl_command_queue command_queue, cl_mem buffer, const void *pattern,
size_t pattern_size, size_t offset, size_t cb,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clEnqueueFillBuffer;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBuffer)(
cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
size_t src_offset, size_t dst_offset, size_t cb,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferRect)(
cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
const size_t *src_origin, const size_t *dst_origin, const size_t *region,
size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch,
size_t dst_slice_pitch, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_1;
#else
typedef void *cl_api_clEnqueueCopyBufferRect;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadImage)(
cl_command_queue command_queue, cl_mem image, cl_bool blocking_read,
const size_t *origin, const size_t *region, size_t row_pitch,
size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteImage)(
cl_command_queue command_queue, cl_mem image, cl_bool blocking_write,
const size_t *origin, const size_t *region, size_t input_row_pitch,
size_t input_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillImage)(
cl_command_queue command_queue, cl_mem image, const void *fill_color,
const size_t origin[3], const size_t region[3],
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clEnqueueFillImage;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImage)(
cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image,
const size_t *src_origin, const size_t *dst_origin, const size_t *region,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImageToBuffer)(
cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer,
const size_t *src_origin, const size_t *region, size_t dst_offset,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferToImage)(
cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image,
size_t src_offset, const size_t *dst_origin, const size_t *region,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapBuffer)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map,
cl_map_flags map_flags, size_t offset, size_t cb,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapImage)(
cl_command_queue command_queue, cl_mem image, cl_bool blocking_map,
cl_map_flags map_flags, const size_t *origin, const size_t *region,
size_t *image_row_pitch, size_t *image_slice_pitch,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueUnmapMemObject)(
cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMigrateMemObjects)(
cl_command_queue command_queue, cl_uint num_mem_objects,
const cl_mem *mem_objects, cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clEnqueueMigrateMemObjects;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNDRangeKernel)(
cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim,
const size_t *global_work_offset, const size_t *global_work_size,
const size_t *local_work_size, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueTask)(
cl_command_queue command_queue, cl_kernel kernel,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNativeKernel)(
cl_command_queue command_queue, void(CL_CALLBACK *user_func)(void *),
void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list,
const void **args_mem_loc, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarkerWithWaitList)(
cl_command_queue command_queue, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrierWithWaitList)(
cl_command_queue command_queue, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY void *(
CL_API_CALL *cl_api_clGetExtensionFunctionAddressForPlatform)(
cl_platform_id platform,
const char *function_name)CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clEnqueueMarkerWithWaitList;
typedef void *cl_api_clEnqueueBarrierWithWaitList;
typedef void *cl_api_clGetExtensionFunctionAddressForPlatform;
#endif
// Shared Virtual Memory APIs
#ifdef CL_VERSION_2_0
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMFree)(
cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */,
void ** /* svm_pointers */,
void(CL_CALLBACK *pfn_free_func)(cl_command_queue /* queue */,
cl_uint /* num_svm_pointers */,
void ** /* svm_pointers[] */,
void * /* user_data */),
void * /* user_data */, cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemcpy)(
cl_command_queue /* command_queue */, cl_bool /* blocking_copy */,
void * /* dst_ptr */, const void * /* src_ptr */, size_t /* size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemFill)(
cl_command_queue /* command_queue */, void * /* svm_ptr */,
const void * /* pattern */, size_t /* pattern_size */, size_t /* size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMap)(
cl_command_queue /* command_queue */, cl_bool /* blocking_map */,
cl_map_flags /* map_flags */, void * /* svm_ptr */, size_t /* size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMUnmap)(
cl_command_queue /* command_queue */, void * /* svm_ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
#else
typedef void *cl_api_clEnqueueSVMFree;
typedef void *cl_api_clEnqueueSVMMemcpy;
typedef void *cl_api_clEnqueueSVMMemFill;
typedef void *cl_api_clEnqueueSVMMap;
typedef void *cl_api_clEnqueueSVMUnmap;
#endif
// Deprecated APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetCommandQueueProperty)(
cl_command_queue command_queue, cl_command_queue_properties properties,
cl_bool enable, cl_command_queue_properties *old_properties)
CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage2D)(
cl_context context, cl_mem_flags flags, const cl_image_format *image_format,
size_t image_width, size_t image_height, size_t image_row_pitch,
void *host_ptr, cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage3D)(
cl_context context, cl_mem_flags flags, const cl_image_format *image_format,
size_t image_width, size_t image_height, size_t image_depth,
size_t image_row_pitch, size_t image_slice_pitch, void *host_ptr,
cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadCompiler)(void)
CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarker)(
cl_command_queue command_queue,
cl_event *event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWaitForEvents)(
cl_command_queue command_queue, cl_uint num_events,
const cl_event *event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrier)(
cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clGetExtensionFunctionAddress)(
const char *function_name)CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
// GL and other APIs
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLBuffer)(
cl_context context, cl_mem_flags flags, cl_GLuint bufobj,
int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture)(
cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel,
cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture2D)(
cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel,
cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture3D)(
cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel,
cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLRenderbuffer)(
cl_context context, cl_mem_flags flags, cl_GLuint renderbuffer,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLObjectInfo)(
cl_mem memobj, cl_gl_object_type *gl_object_type,
cl_GLuint *gl_object_name) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLTextureInfo)(
cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireGLObjects)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseGLObjects)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
/* cl_khr_gl_sharing */
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLContextInfoKHR)(
const cl_context_properties *properties, cl_gl_context_info param_name,
size_t param_value_size, void *param_value, size_t *param_value_size_ret);
/* cl_khr_gl_event */
typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromGLsyncKHR)(
cl_context context, cl_GLsync sync, cl_int *errcode_ret);
#if defined(_WIN32)
/* cl_khr_d3d10_sharing */
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D10KHR)(
cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source,
void *d3d_object, cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries, cl_device_id *devices,
cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10BufferKHR)(
cl_context context, cl_mem_flags flags, ID3D10Buffer *resource,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture2DKHR)(
cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource,
UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture3DKHR)(
cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource,
UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D10ObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D10ObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D10KHR(
cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source,
void *d3d_object, cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices);
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromD3D10BufferKHR(cl_context context, cl_mem_flags flags,
ID3D10Buffer *resource, cl_int *errcode_ret);
extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture2DKHR(
cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource,
UINT subresource, cl_int *errcode_ret);
extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture3DKHR(
cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource,
UINT subresource, cl_int *errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D10ObjectsKHR(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D10ObjectsKHR(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
/* cl_khr_d3d11_sharing */
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D11KHR)(
cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source,
void *d3d_object, cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries, cl_device_id *devices,
cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2;
/* NOTE(review): continuation of the cl_khr_d3d11_sharing dispatch
 * signatures (the section's opening typedef and the enclosing #if begin
 * above this window -- confirm against the full header). Each typedef is a
 * pointer-to-function type for one extension entry point; these types fill
 * slots in the cl_icd_dispatch table defined later in this file. */
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11BufferKHR)(
cl_context context, cl_mem_flags flags, ID3D11Buffer *resource,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture2DKHR)(
cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource,
UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture3DKHR)(
cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource,
UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D11ObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D11ObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
/* cl_khr_dx9_media_sharing: DX9 media-surface interop entry points */
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR)(
cl_platform_id platform, cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr *media_adapters_type, void *media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries,
cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromDX9MediaSurfaceKHR)(
cl_context context, cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type, void *surface_info,
cl_uint plane, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
/* cl_khr_d3d11_sharing: extern prototypes for the extension entry points
 * whose dispatch (function-pointer) signatures are declared above. */
extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D11KHR(
cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source,
void *d3d_object, cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices);
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromD3D11BufferKHR(cl_context context, cl_mem_flags flags,
ID3D11Buffer *resource, cl_int *errcode_ret);
extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture2DKHR(
cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource,
UINT subresource, cl_int *errcode_ret);
extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture3DKHR(
cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource,
UINT subresource, cl_int *errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D11ObjectsKHR(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D11ObjectsKHR(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
/* cl_khr_dx9_media_sharing */
extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromDX9MediaAdapterKHR(
cl_platform_id platform, cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr *media_adapter_type, void *media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries,
cl_device_id *devices, cl_uint *num_devices);
extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromDX9MediaSurfaceKHR(
cl_context context, cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type, void *surface_info,
cl_uint plane, cl_int *errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireDX9MediaSurfacesKHR(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9MediaSurfacesKHR(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
#else
/* When the D3D/DX9 interop headers are unavailable, each dispatch slot
 * degenerates to a plain void pointer so that the cl_icd_dispatch struct
 * below keeps an identical size and slot order on every platform. */
/* cl_khr_d3d10_sharing */
typedef void *cl_api_clGetDeviceIDsFromD3D10KHR;
typedef void *cl_api_clCreateFromD3D10BufferKHR;
typedef void *cl_api_clCreateFromD3D10Texture2DKHR;
typedef void *cl_api_clCreateFromD3D10Texture3DKHR;
typedef void *cl_api_clEnqueueAcquireD3D10ObjectsKHR;
typedef void *cl_api_clEnqueueReleaseD3D10ObjectsKHR;
/* cl_khr_d3d11_sharing */
typedef void *cl_api_clGetDeviceIDsFromD3D11KHR;
typedef void *cl_api_clCreateFromD3D11BufferKHR;
typedef void *cl_api_clCreateFromD3D11Texture2DKHR;
typedef void *cl_api_clCreateFromD3D11Texture3DKHR;
typedef void *cl_api_clEnqueueAcquireD3D11ObjectsKHR;
typedef void *cl_api_clEnqueueReleaseD3D11ObjectsKHR;
/* cl_khr_dx9_media_sharing */
typedef void *cl_api_clCreateFromDX9MediaSurfaceKHR;
typedef void *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR;
typedef void *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR;
typedef void *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR;
#endif
/* OpenCL 1.1 dispatch signatures. Parameter names are carried as inline
 * comments (upstream Khronos style for this section). */
#ifdef CL_VERSION_1_1
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetEventCallback)(
cl_event /* event */, cl_int /* command_exec_callback_type */,
void(CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
void * /* user_data */) CL_API_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateSubBuffer)(
cl_mem /* buffer */, cl_mem_flags /* flags */,
cl_buffer_create_type /* buffer_create_type */,
const void * /* buffer_create_info */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clSetMemObjectDestructorCallback)(
cl_mem /* memobj */,
void(CL_CALLBACK * /*pfn_notify*/)(cl_mem /* memobj */,
void * /*user_data*/),
void * /*user_data */) CL_API_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateUserEvent)(
cl_context /* context */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetUserEventStatus)(
cl_event /* event */,
cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
#else
/* void* placeholders keep the dispatch table layout stable when the
 * OpenCL 1.1 declarations are not available. */
typedef void *cl_api_clSetEventCallback;
typedef void *cl_api_clCreateSubBuffer;
typedef void *cl_api_clSetMemObjectDestructorCallback;
typedef void *cl_api_clCreateUserEvent;
typedef void *cl_api_clSetUserEventStatus;
#endif
/* cl_ext_device_fission: pre-1.2 sub-device partitioning extension. */
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevicesEXT)(
cl_device_id in_device,
const cl_device_partition_property_ext *partition_properties,
cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices);
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDeviceEXT)(
cl_device_id device) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDeviceEXT)(
cl_device_id device) CL_API_SUFFIX__VERSION_1_0;
/* cl_khr_egl_image */
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromEGLImageKHR)(
cl_context context, CLeglDisplayKHR display, CLeglImageKHR image,
cl_mem_flags flags, const cl_egl_image_properties_khr *properties,
cl_int *errcode_ret);
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireEGLObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseEGLObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
/* cl_khr_egl_event */
typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromEGLSyncKHR)(
cl_context context, CLeglSyncKHR sync, CLeglDisplayKHR display,
cl_int *errcode_ret);
#ifdef CL_VERSION_2_1
/* OpenCL 2.1 dispatch signatures. */
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetDefaultDeviceCommandQueue)(
cl_context context, cl_device_id device,
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1;
typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithIL)(
cl_context context, const void *il, size_t length,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfo)(
cl_kernel kernel, cl_device_id device, cl_kernel_sub_group_info param_name,
size_t input_value_size, const void *input_value, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_2_1;
typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCloneKernel)(
cl_kernel source_kernel, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMigrateMem)(
cl_command_queue command_queue, cl_uint num_svm_pointers,
const void **svm_pointers, const size_t *sizes,
cl_mem_migration_flags flags, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_2_1;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceAndHostTimer)(
cl_device_id device, cl_ulong *device_timestamp,
cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetHostTimer)(
cl_device_id device, cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1;
#else
/* void* placeholders keep the dispatch table layout stable when the
 * OpenCL 2.1 declarations are not available. */
typedef void *cl_api_clSetDefaultDeviceCommandQueue;
typedef void *cl_api_clCreateProgramWithIL;
typedef void *cl_api_clGetKernelSubGroupInfo;
typedef void *cl_api_clCloneKernel;
typedef void *cl_api_clEnqueueSVMMigrateMem;
typedef void *cl_api_clGetDeviceAndHostTimer;
typedef void *cl_api_clGetHostTimer;
#endif
/* Vendor dispatch table structure.
 *
 * NOTE(review): per the Khronos cl_khr_icd extension, the ICD loader routes
 * every API call through a table of this layout supplied by the vendor
 * driver, so the slot ORDER below is the ABI: entries may only ever be
 * appended, never reordered or removed. Slots for unavailable features are
 * typedef'd to void * above, which keeps the table the same size on every
 * platform. Do not edit this struct except to track the upstream header. */
typedef struct _cl_icd_dispatch {
/* OpenCL 1.0 */
cl_api_clGetPlatformIDs clGetPlatformIDs;
cl_api_clGetPlatformInfo clGetPlatformInfo;
cl_api_clGetDeviceIDs clGetDeviceIDs;
cl_api_clGetDeviceInfo clGetDeviceInfo;
cl_api_clCreateContext clCreateContext;
cl_api_clCreateContextFromType clCreateContextFromType;
cl_api_clRetainContext clRetainContext;
cl_api_clReleaseContext clReleaseContext;
cl_api_clGetContextInfo clGetContextInfo;
cl_api_clCreateCommandQueue clCreateCommandQueue;
cl_api_clRetainCommandQueue clRetainCommandQueue;
cl_api_clReleaseCommandQueue clReleaseCommandQueue;
cl_api_clGetCommandQueueInfo clGetCommandQueueInfo;
cl_api_clSetCommandQueueProperty clSetCommandQueueProperty;
cl_api_clCreateBuffer clCreateBuffer;
cl_api_clCreateImage2D clCreateImage2D;
cl_api_clCreateImage3D clCreateImage3D;
cl_api_clRetainMemObject clRetainMemObject;
cl_api_clReleaseMemObject clReleaseMemObject;
cl_api_clGetSupportedImageFormats clGetSupportedImageFormats;
cl_api_clGetMemObjectInfo clGetMemObjectInfo;
cl_api_clGetImageInfo clGetImageInfo;
cl_api_clCreateSampler clCreateSampler;
cl_api_clRetainSampler clRetainSampler;
cl_api_clReleaseSampler clReleaseSampler;
cl_api_clGetSamplerInfo clGetSamplerInfo;
cl_api_clCreateProgramWithSource clCreateProgramWithSource;
cl_api_clCreateProgramWithBinary clCreateProgramWithBinary;
cl_api_clRetainProgram clRetainProgram;
cl_api_clReleaseProgram clReleaseProgram;
cl_api_clBuildProgram clBuildProgram;
cl_api_clUnloadCompiler clUnloadCompiler;
cl_api_clGetProgramInfo clGetProgramInfo;
cl_api_clGetProgramBuildInfo clGetProgramBuildInfo;
cl_api_clCreateKernel clCreateKernel;
cl_api_clCreateKernelsInProgram clCreateKernelsInProgram;
cl_api_clRetainKernel clRetainKernel;
cl_api_clReleaseKernel clReleaseKernel;
cl_api_clSetKernelArg clSetKernelArg;
cl_api_clGetKernelInfo clGetKernelInfo;
cl_api_clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo;
cl_api_clWaitForEvents clWaitForEvents;
cl_api_clGetEventInfo clGetEventInfo;
cl_api_clRetainEvent clRetainEvent;
cl_api_clReleaseEvent clReleaseEvent;
cl_api_clGetEventProfilingInfo clGetEventProfilingInfo;
cl_api_clFlush clFlush;
cl_api_clFinish clFinish;
cl_api_clEnqueueReadBuffer clEnqueueReadBuffer;
cl_api_clEnqueueWriteBuffer clEnqueueWriteBuffer;
cl_api_clEnqueueCopyBuffer clEnqueueCopyBuffer;
cl_api_clEnqueueReadImage clEnqueueReadImage;
cl_api_clEnqueueWriteImage clEnqueueWriteImage;
cl_api_clEnqueueCopyImage clEnqueueCopyImage;
cl_api_clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer;
cl_api_clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage;
cl_api_clEnqueueMapBuffer clEnqueueMapBuffer;
cl_api_clEnqueueMapImage clEnqueueMapImage;
cl_api_clEnqueueUnmapMemObject clEnqueueUnmapMemObject;
cl_api_clEnqueueNDRangeKernel clEnqueueNDRangeKernel;
cl_api_clEnqueueTask clEnqueueTask;
cl_api_clEnqueueNativeKernel clEnqueueNativeKernel;
cl_api_clEnqueueMarker clEnqueueMarker;
cl_api_clEnqueueWaitForEvents clEnqueueWaitForEvents;
cl_api_clEnqueueBarrier clEnqueueBarrier;
cl_api_clGetExtensionFunctionAddress clGetExtensionFunctionAddress;
cl_api_clCreateFromGLBuffer clCreateFromGLBuffer;
cl_api_clCreateFromGLTexture2D clCreateFromGLTexture2D;
cl_api_clCreateFromGLTexture3D clCreateFromGLTexture3D;
cl_api_clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer;
cl_api_clGetGLObjectInfo clGetGLObjectInfo;
cl_api_clGetGLTextureInfo clGetGLTextureInfo;
cl_api_clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects;
cl_api_clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects;
cl_api_clGetGLContextInfoKHR clGetGLContextInfoKHR;
/* cl_khr_d3d10_sharing */
cl_api_clGetDeviceIDsFromD3D10KHR clGetDeviceIDsFromD3D10KHR;
cl_api_clCreateFromD3D10BufferKHR clCreateFromD3D10BufferKHR;
cl_api_clCreateFromD3D10Texture2DKHR clCreateFromD3D10Texture2DKHR;
cl_api_clCreateFromD3D10Texture3DKHR clCreateFromD3D10Texture3DKHR;
cl_api_clEnqueueAcquireD3D10ObjectsKHR clEnqueueAcquireD3D10ObjectsKHR;
cl_api_clEnqueueReleaseD3D10ObjectsKHR clEnqueueReleaseD3D10ObjectsKHR;
/* OpenCL 1.1 */
cl_api_clSetEventCallback clSetEventCallback;
cl_api_clCreateSubBuffer clCreateSubBuffer;
cl_api_clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback;
cl_api_clCreateUserEvent clCreateUserEvent;
cl_api_clSetUserEventStatus clSetUserEventStatus;
cl_api_clEnqueueReadBufferRect clEnqueueReadBufferRect;
cl_api_clEnqueueWriteBufferRect clEnqueueWriteBufferRect;
cl_api_clEnqueueCopyBufferRect clEnqueueCopyBufferRect;
/* cl_ext_device_fission */
cl_api_clCreateSubDevicesEXT clCreateSubDevicesEXT;
cl_api_clRetainDeviceEXT clRetainDeviceEXT;
cl_api_clReleaseDeviceEXT clReleaseDeviceEXT;
/* cl_khr_gl_event */
cl_api_clCreateEventFromGLsyncKHR clCreateEventFromGLsyncKHR;
/* OpenCL 1.2 */
cl_api_clCreateSubDevices clCreateSubDevices;
cl_api_clRetainDevice clRetainDevice;
cl_api_clReleaseDevice clReleaseDevice;
cl_api_clCreateImage clCreateImage;
cl_api_clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels;
cl_api_clCompileProgram clCompileProgram;
cl_api_clLinkProgram clLinkProgram;
cl_api_clUnloadPlatformCompiler clUnloadPlatformCompiler;
cl_api_clGetKernelArgInfo clGetKernelArgInfo;
cl_api_clEnqueueFillBuffer clEnqueueFillBuffer;
cl_api_clEnqueueFillImage clEnqueueFillImage;
cl_api_clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects;
cl_api_clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList;
cl_api_clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList;
cl_api_clGetExtensionFunctionAddressForPlatform
clGetExtensionFunctionAddressForPlatform;
cl_api_clCreateFromGLTexture clCreateFromGLTexture;
/* cl_khr_d3d11_sharing */
cl_api_clGetDeviceIDsFromD3D11KHR clGetDeviceIDsFromD3D11KHR;
cl_api_clCreateFromD3D11BufferKHR clCreateFromD3D11BufferKHR;
cl_api_clCreateFromD3D11Texture2DKHR clCreateFromD3D11Texture2DKHR;
cl_api_clCreateFromD3D11Texture3DKHR clCreateFromD3D11Texture3DKHR;
cl_api_clCreateFromDX9MediaSurfaceKHR clCreateFromDX9MediaSurfaceKHR;
cl_api_clEnqueueAcquireD3D11ObjectsKHR clEnqueueAcquireD3D11ObjectsKHR;
cl_api_clEnqueueReleaseD3D11ObjectsKHR clEnqueueReleaseD3D11ObjectsKHR;
/* cl_khr_dx9_media_sharing */
cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR
clGetDeviceIDsFromDX9MediaAdapterKHR;
cl_api_clEnqueueAcquireDX9MediaSurfacesKHR
clEnqueueAcquireDX9MediaSurfacesKHR;
cl_api_clEnqueueReleaseDX9MediaSurfacesKHR
clEnqueueReleaseDX9MediaSurfacesKHR;
/* cl_khr_egl_image */
cl_api_clCreateFromEGLImageKHR clCreateFromEGLImageKHR;
cl_api_clEnqueueAcquireEGLObjectsKHR clEnqueueAcquireEGLObjectsKHR;
cl_api_clEnqueueReleaseEGLObjectsKHR clEnqueueReleaseEGLObjectsKHR;
/* cl_khr_egl_event */
cl_api_clCreateEventFromEGLSyncKHR clCreateEventFromEGLSyncKHR;
/* OpenCL 2.0 */
cl_api_clCreateCommandQueueWithProperties clCreateCommandQueueWithProperties;
cl_api_clCreatePipe clCreatePipe;
cl_api_clGetPipeInfo clGetPipeInfo;
cl_api_clSVMAlloc clSVMAlloc;
cl_api_clSVMFree clSVMFree;
cl_api_clEnqueueSVMFree clEnqueueSVMFree;
cl_api_clEnqueueSVMMemcpy clEnqueueSVMMemcpy;
cl_api_clEnqueueSVMMemFill clEnqueueSVMMemFill;
cl_api_clEnqueueSVMMap clEnqueueSVMMap;
cl_api_clEnqueueSVMUnmap clEnqueueSVMUnmap;
cl_api_clCreateSamplerWithProperties clCreateSamplerWithProperties;
cl_api_clSetKernelArgSVMPointer clSetKernelArgSVMPointer;
cl_api_clSetKernelExecInfo clSetKernelExecInfo;
/* cl_khr_sub_groups */
cl_api_clGetKernelSubGroupInfoKHR clGetKernelSubGroupInfoKHR;
/* OpenCL 2.1 */
cl_api_clCloneKernel clCloneKernel;
cl_api_clCreateProgramWithIL clCreateProgramWithIL;
cl_api_clEnqueueSVMMigrateMem clEnqueueSVMMigrateMem;
cl_api_clGetDeviceAndHostTimer clGetDeviceAndHostTimer;
cl_api_clGetHostTimer clGetHostTimer;
cl_api_clGetKernelSubGroupInfo clGetKernelSubGroupInfo;
cl_api_clSetDefaultDeviceCommandQueue clSetDefaultDeviceCommandQueue;
/* OpenCL 2.2 */
cl_api_clSetProgramReleaseCallback clSetProgramReleaseCallback;
cl_api_clSetProgramSpecializationConstant clSetProgramSpecializationConstant;
/* OpenCL 3.0 */
cl_api_clCreateBufferWithProperties clCreateBufferWithProperties;
cl_api_clCreateImageWithProperties clCreateImageWithProperties;
} cl_icd_dispatch;
#ifdef __cplusplus
}
#endif
#endif /* #ifndef OPENCL_CL_ICD_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_platform.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __CL_PLATFORM_H
#define __CL_PLATFORM_H
#include &lt;CL/cl_version.h&gt;
#ifdef __cplusplus
extern "C" {
#endif
/* Calling-convention / linkage decoration macros. On Windows the OpenCL
 * entry points and callbacks use __stdcall; elsewhere the default C calling
 * convention applies, so the macros expand to nothing. */
#if defined(_WIN32)
#define CL_API_ENTRY
#define CL_API_CALL __stdcall
#define CL_CALLBACK __stdcall
#else
#define CL_API_ENTRY
#define CL_API_CALL
#define CL_CALLBACK
#endif
/*
* Deprecation flags refer to the last version of the header in which the
* feature was not deprecated.
*
* E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without
* deprecation but is deprecated in versions later than 1.1.
*/
#define CL_EXTENSION_WEAK_LINK
#define CL_API_SUFFIX__VERSION_1_0
#define CL_EXT_SUFFIX__VERSION_1_0
#define CL_API_SUFFIX__VERSION_1_1
#define CL_EXT_SUFFIX__VERSION_1_1
#define CL_API_SUFFIX__VERSION_1_2
#define CL_EXT_SUFFIX__VERSION_1_2
#define CL_API_SUFFIX__VERSION_2_0
#define CL_EXT_SUFFIX__VERSION_2_0
#define CL_API_SUFFIX__VERSION_2_1
#define CL_EXT_SUFFIX__VERSION_2_1
#define CL_API_SUFFIX__VERSION_2_2
#define CL_EXT_SUFFIX__VERSION_2_2
#define CL_API_SUFFIX__VERSION_3_0
#define CL_EXT_SUFFIX__VERSION_3_0
#define CL_API_SUFFIX__EXPERIMENTAL
#define CL_EXT_SUFFIX__EXPERIMENTAL
#ifdef __GNUC__
#define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated))
#define CL_EXT_PREFIX_DEPRECATED
#elif defined(_WIN32)
#define CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated)
#else
#define CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
#else
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
#else
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
#else
#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS
#define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED
#define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED
#else
#define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS
#define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED
#define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED
#else
#define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS
#define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED
#define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED
#else
#define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
#endif
#if (defined (_WIN32) && defined(_MSC_VER))
/* scalar types */
typedef signed __int8 cl_char;
typedef unsigned __int8 cl_uchar;
typedef signed __int16 cl_short;
typedef unsigned __int16 cl_ushort;
typedef signed __int32 cl_int;
typedef unsigned __int32 cl_uint;
typedef signed __int64 cl_long;
typedef unsigned __int64 cl_ulong;
typedef unsigned __int16 cl_half;
typedef float cl_float;
typedef double cl_double;
/* Macro names and corresponding values defined by OpenCL */
#define CL_CHAR_BIT 8
#define CL_SCHAR_MAX 127
#define CL_SCHAR_MIN (-127-1)
#define CL_CHAR_MAX CL_SCHAR_MAX
#define CL_CHAR_MIN CL_SCHAR_MIN
#define CL_UCHAR_MAX 255
#define CL_SHRT_MAX 32767
#define CL_SHRT_MIN (-32767-1)
#define CL_USHRT_MAX 65535
#define CL_INT_MAX 2147483647
#define CL_INT_MIN (-2147483647-1)
#define CL_UINT_MAX 0xffffffffU
#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
#define CL_FLT_DIG 6
#define CL_FLT_MANT_DIG 24
#define CL_FLT_MAX_10_EXP +38
#define CL_FLT_MAX_EXP +128
#define CL_FLT_MIN_10_EXP -37
#define CL_FLT_MIN_EXP -125
#define CL_FLT_RADIX 2
#define CL_FLT_MAX 340282346638528859811704183484516925440.0f
#define CL_FLT_MIN 1.175494350822287507969e-38f
#define CL_FLT_EPSILON 1.1920928955078125e-7f
#define CL_HALF_DIG 3
#define CL_HALF_MANT_DIG 11
#define CL_HALF_MAX_10_EXP +4
#define CL_HALF_MAX_EXP +16
#define CL_HALF_MIN_10_EXP -4
#define CL_HALF_MIN_EXP -13
#define CL_HALF_RADIX 2
#define CL_HALF_MAX 65504.0f
#define CL_HALF_MIN 6.103515625e-05f
#define CL_HALF_EPSILON 9.765625e-04f
#define CL_DBL_DIG 15
#define CL_DBL_MANT_DIG 53
#define CL_DBL_MAX_10_EXP +308
#define CL_DBL_MAX_EXP +1024
#define CL_DBL_MIN_10_EXP -307
#define CL_DBL_MIN_EXP -1021
#define CL_DBL_RADIX 2
#define CL_DBL_MAX 1.7976931348623158e+308
#define CL_DBL_MIN 2.225073858507201383090e-308
#define CL_DBL_EPSILON 2.220446049250313080847e-16
#define CL_M_E 2.7182818284590452354
#define CL_M_LOG2E 1.4426950408889634074
#define CL_M_LOG10E 0.43429448190325182765
#define CL_M_LN2 0.69314718055994530942
#define CL_M_LN10 2.30258509299404568402
#define CL_M_PI 3.14159265358979323846
#define CL_M_PI_2 1.57079632679489661923
#define CL_M_PI_4 0.78539816339744830962
#define CL_M_1_PI 0.31830988618379067154
#define CL_M_2_PI 0.63661977236758134308
#define CL_M_2_SQRTPI 1.12837916709551257390
#define CL_M_SQRT2 1.41421356237309504880
#define CL_M_SQRT1_2 0.70710678118654752440
#define CL_M_E_F 2.718281828f
#define CL_M_LOG2E_F 1.442695041f
#define CL_M_LOG10E_F 0.434294482f
#define CL_M_LN2_F 0.693147181f
#define CL_M_LN10_F 2.302585093f
#define CL_M_PI_F 3.141592654f
#define CL_M_PI_2_F 1.570796327f
#define CL_M_PI_4_F 0.785398163f
#define CL_M_1_PI_F 0.318309886f
#define CL_M_2_PI_F 0.636619772f
#define CL_M_2_SQRTPI_F 1.128379167f
#define CL_M_SQRT2_F 1.414213562f
#define CL_M_SQRT1_2_F 0.707106781f
#define CL_NAN (CL_INFINITY - CL_INFINITY)
#define CL_HUGE_VALF ((cl_float) 1e50)
#define CL_HUGE_VAL ((cl_double) 1e500)
#define CL_MAXFLOAT CL_FLT_MAX
#define CL_INFINITY CL_HUGE_VALF
#else
#include &lt;stdint.h&gt;
/* scalar types */
typedef int8_t cl_char;
typedef uint8_t cl_uchar;
typedef int16_t cl_short;
typedef uint16_t cl_ushort;
typedef int32_t cl_int;
typedef uint32_t cl_uint;
typedef int64_t cl_long;
typedef uint64_t cl_ulong;
typedef uint16_t cl_half;
typedef float cl_float;
typedef double cl_double;
/* Macro names and corresponding values defined by OpenCL */
#define CL_CHAR_BIT 8
#define CL_SCHAR_MAX 127
#define CL_SCHAR_MIN (-127-1)
#define CL_CHAR_MAX CL_SCHAR_MAX
#define CL_CHAR_MIN CL_SCHAR_MIN
#define CL_UCHAR_MAX 255
#define CL_SHRT_MAX 32767
#define CL_SHRT_MIN (-32767-1)
#define CL_USHRT_MAX 65535
#define CL_INT_MAX 2147483647
#define CL_INT_MIN (-2147483647-1)
#define CL_UINT_MAX 0xffffffffU
#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
#define CL_FLT_DIG 6
#define CL_FLT_MANT_DIG 24
#define CL_FLT_MAX_10_EXP +38
#define CL_FLT_MAX_EXP +128
#define CL_FLT_MIN_10_EXP -37
#define CL_FLT_MIN_EXP -125
#define CL_FLT_RADIX 2
#define CL_FLT_MAX 340282346638528859811704183484516925440.0f
#define CL_FLT_MIN 1.175494350822287507969e-38f
#define CL_FLT_EPSILON 1.1920928955078125e-7f
#define CL_HALF_DIG 3
#define CL_HALF_MANT_DIG 11
#define CL_HALF_MAX_10_EXP +4
#define CL_HALF_MAX_EXP +16
#define CL_HALF_MIN_10_EXP -4
#define CL_HALF_MIN_EXP -13
#define CL_HALF_RADIX 2
#define CL_HALF_MAX 65504.0f
#define CL_HALF_MIN 6.103515625e-05f
#define CL_HALF_EPSILON 9.765625e-04f
#define CL_DBL_DIG 15
#define CL_DBL_MANT_DIG 53
#define CL_DBL_MAX_10_EXP +308
#define CL_DBL_MAX_EXP +1024
#define CL_DBL_MIN_10_EXP -307
#define CL_DBL_MIN_EXP -1021
#define CL_DBL_RADIX 2
#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
#define CL_DBL_MIN 2.225073858507201383090e-308
#define CL_DBL_EPSILON 2.220446049250313080847e-16
#define CL_M_E 2.7182818284590452354
#define CL_M_LOG2E 1.4426950408889634074
#define CL_M_LOG10E 0.43429448190325182765
#define CL_M_LN2 0.69314718055994530942
#define CL_M_LN10 2.30258509299404568402
#define CL_M_PI 3.14159265358979323846
#define CL_M_PI_2 1.57079632679489661923
#define CL_M_PI_4 0.78539816339744830962
#define CL_M_1_PI 0.31830988618379067154
#define CL_M_2_PI 0.63661977236758134308
#define CL_M_2_SQRTPI 1.12837916709551257390
#define CL_M_SQRT2 1.41421356237309504880
#define CL_M_SQRT1_2 0.70710678118654752440
#define CL_M_E_F 2.718281828f
#define CL_M_LOG2E_F 1.442695041f
#define CL_M_LOG10E_F 0.434294482f
#define CL_M_LN2_F 0.693147181f
#define CL_M_LN10_F 2.302585093f
#define CL_M_PI_F 3.141592654f
#define CL_M_PI_2_F 1.570796327f
#define CL_M_PI_4_F 0.785398163f
#define CL_M_1_PI_F 0.318309886f
#define CL_M_2_PI_F 0.636619772f
#define CL_M_2_SQRTPI_F 1.128379167f
#define CL_M_SQRT2_F 1.414213562f
#define CL_M_SQRT1_2_F 0.707106781f
#if defined( __GNUC__ )
#define CL_HUGE_VALF __builtin_huge_valf()
#define CL_HUGE_VAL __builtin_huge_val()
#define CL_NAN __builtin_nanf( "" )
#else
#define CL_HUGE_VALF ((cl_float) 1e50)
#define CL_HUGE_VAL ((cl_double) 1e500)
float nanf( const char * );
#define CL_NAN nanf( "" )
#endif
#define CL_MAXFLOAT CL_FLT_MAX
#define CL_INFINITY CL_HUGE_VALF
#endif
#include &lt;stddef.h&gt;
/* Mirror types to GL types. Mirror types allow us to avoid deciding which GL headers to load based on whether we are using GL or GLES here. */
typedef unsigned int cl_GLuint;
typedef int cl_GLint;
typedef unsigned int cl_GLenum;
/*
* Vector types
*
* Note: OpenCL requires that all types be naturally aligned.
* This means that vector types must be naturally aligned.
* For example, a vector of four floats must be aligned to
* a 16 byte boundary (calculated as 4 * the natural 4-byte
* alignment of the float). The alignment qualifiers here
* will only function properly if your compiler supports them
* and if you don't actively work to defeat them. For example,
* in order for a cl_float4 to be 16 byte aligned in a struct,
* the start of the struct must itself be 16-byte aligned.
*
* Maintaining proper alignment is the user's responsibility.
*/
/* Define basic vector types */
#if defined( __VEC__ )
#include &lt;altivec.h&gt; /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
typedef __vector unsigned char __cl_uchar16;
typedef __vector signed char __cl_char16;
typedef __vector unsigned short __cl_ushort8;
typedef __vector signed short __cl_short8;
typedef __vector unsigned int __cl_uint4;
typedef __vector signed int __cl_int4;
typedef __vector float __cl_float4;
#define __CL_UCHAR16__ 1
#define __CL_CHAR16__ 1
#define __CL_USHORT8__ 1
#define __CL_SHORT8__ 1
#define __CL_UINT4__ 1
#define __CL_INT4__ 1
#define __CL_FLOAT4__ 1
#endif
#if defined( __SSE__ )
#if defined( __MINGW64__ )
#include &lt;intrin.h&gt;
#else
#include &lt;xmmintrin.h&gt;
#endif
#if defined( __GNUC__ )
typedef float __cl_float4 __attribute__((vector_size(16)));
#else
typedef __m128 __cl_float4;
#endif
#define __CL_FLOAT4__ 1
#endif
#if defined( __SSE2__ )
#if defined( __MINGW64__ )
#include &lt;intrin.h&gt;
#else
#include &lt;emmintrin.h&gt;
#endif
#if defined( __GNUC__ )
typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16)));
typedef cl_char __cl_char16 __attribute__((vector_size(16)));
typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16)));
typedef cl_short __cl_short8 __attribute__((vector_size(16)));
typedef cl_uint __cl_uint4 __attribute__((vector_size(16)));
typedef cl_int __cl_int4 __attribute__((vector_size(16)));
typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16)));
typedef cl_long __cl_long2 __attribute__((vector_size(16)));
typedef cl_double __cl_double2 __attribute__((vector_size(16)));
#else
typedef __m128i __cl_uchar16;
typedef __m128i __cl_char16;
typedef __m128i __cl_ushort8;
typedef __m128i __cl_short8;
typedef __m128i __cl_uint4;
typedef __m128i __cl_int4;
typedef __m128i __cl_ulong2;
typedef __m128i __cl_long2;
typedef __m128d __cl_double2;
#endif
#define __CL_UCHAR16__ 1
#define __CL_CHAR16__ 1
#define __CL_USHORT8__ 1
#define __CL_SHORT8__ 1
#define __CL_INT4__ 1
#define __CL_UINT4__ 1
#define __CL_ULONG2__ 1
#define __CL_LONG2__ 1
#define __CL_DOUBLE2__ 1
#endif
#if defined( __MMX__ )
#include &lt;mmintrin.h&gt;
#if defined( __GNUC__ )
typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8)));
typedef cl_char __cl_char8 __attribute__((vector_size(8)));
typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8)));
typedef cl_short __cl_short4 __attribute__((vector_size(8)));
typedef cl_uint __cl_uint2 __attribute__((vector_size(8)));
typedef cl_int __cl_int2 __attribute__((vector_size(8)));
typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8)));
typedef cl_long __cl_long1 __attribute__((vector_size(8)));
typedef cl_float __cl_float2 __attribute__((vector_size(8)));
#else
typedef __m64 __cl_uchar8;
typedef __m64 __cl_char8;
typedef __m64 __cl_ushort4;
typedef __m64 __cl_short4;
typedef __m64 __cl_uint2;
typedef __m64 __cl_int2;
typedef __m64 __cl_ulong1;
typedef __m64 __cl_long1;
typedef __m64 __cl_float2;
#endif
#define __CL_UCHAR8__ 1
#define __CL_CHAR8__ 1
#define __CL_USHORT4__ 1
#define __CL_SHORT4__ 1
#define __CL_INT2__ 1
#define __CL_UINT2__ 1
#define __CL_ULONG1__ 1
#define __CL_LONG1__ 1
#define __CL_FLOAT2__ 1
#endif
#if defined( __AVX__ )
#if defined( __MINGW64__ )
#include &lt;intrin.h&gt;
#else
#include &lt;immintrin.h&gt;
#endif
#if defined( __GNUC__ )
typedef cl_float __cl_float8 __attribute__((vector_size(32)));
typedef cl_double __cl_double4 __attribute__((vector_size(32)));
#else
typedef __m256 __cl_float8;
typedef __m256d __cl_double4;
#endif
#define __CL_FLOAT8__ 1
#define __CL_DOUBLE4__ 1
#endif
/* Define capabilities for anonymous struct members. */
/* C11 and later guarantee anonymous struct/union members, so no extension
 * keyword is required. */
#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
#define __CL_HAS_ANON_STRUCT__ 1
#define __CL_ANON_STRUCT__
/* GCC supports anonymous members as an extension outside strict-ANSI mode;
 * __extension__ silences the pedantic diagnostic at each use site. */
#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
#define __CL_HAS_ANON_STRUCT__ 1
#define __CL_ANON_STRUCT__ __extension__
#elif defined( _WIN32) && defined(_MSC_VER)
#if _MSC_VER >= 1500
/* Microsoft Developer Studio 2008 supports anonymous structs, but
 * complains by default. */
#define __CL_HAS_ANON_STRUCT__ 1
#define __CL_ANON_STRUCT__
/* Disable warning C4201: nonstandard extension used : nameless
 * struct/union */
#pragma warning( push )
#pragma warning( disable : 4201 )
#endif
/* NOTE(review): on MSVC older than 1500 this branch leaves
 * __CL_HAS_ANON_STRUCT__ undefined (which still evaluates to 0 in #if). */
#else
#define __CL_HAS_ANON_STRUCT__ 0
#define __CL_ANON_STRUCT__
#endif
/* Define alignment keys */
/* CL_ALIGNED(n) requests n-byte alignment for the vector unions below. */
#if defined( __GNUC__ )
#define CL_ALIGNED(_x) __attribute__ ((aligned(_x)))
#elif defined( _WIN32) && (_MSC_VER)
/* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */
/* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */
/* #include */
/* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */
#define CL_ALIGNED(_x)
#else
/* Unknown compiler: emit a diagnostic and fall back to natural alignment. */
#warning Need to implement some method to align data here
#define CL_ALIGNED(_x)
#endif
/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
/* Only advertised when the compiler supports anonymous struct members,
 * since the named fields are implemented with them. */
#if __CL_HAS_ANON_STRUCT__
/* .xyzw and .s0123...{f|F} are supported */
#define CL_HAS_NAMED_VECTOR_FIELDS 1
/* .hi and .lo are supported */
#define CL_HAS_HI_LO_VECTOR_FIELDS 1
#endif
/* Define cl_vector types */
/* ---- cl_charn ---- */
/* Each cl_<type>N is a union: the s[] array, optional named fields
 * (.x/.y/..., .s0/.s1/..., .lo/.hi) and optional packed native-vector
 * members all alias the same storage. */
/* 2 x cl_char, 2-byte alignment. */
typedef union
{
cl_char CL_ALIGNED(2) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_char x, y; };
__CL_ANON_STRUCT__ struct{ cl_char s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_char lo, hi; };
#endif
#if defined( __CL_CHAR2__)
__cl_char2 v2;
#endif
}cl_char2;
/* 4 x cl_char, 4-byte alignment. */
typedef union
{
cl_char CL_ALIGNED(4) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
#endif
#if defined( __CL_CHAR2__)
__cl_char2 v2[2];
#endif
#if defined( __CL_CHAR4__)
__cl_char4 v4;
#endif
}cl_char4;
/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
typedef cl_char4 cl_char3;
/* 8 x cl_char, 8-byte alignment. */
typedef union
{
cl_char CL_ALIGNED(8) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
#endif
#if defined( __CL_CHAR2__)
__cl_char2 v2[4];
#endif
#if defined( __CL_CHAR4__)
__cl_char4 v4[2];
#endif
#if defined( __CL_CHAR8__ )
__cl_char8 v8;
#endif
}cl_char8;
/* 16 x cl_char, 16-byte alignment; __spacer members pad .x/.y/.z/.w out to
 * the sa..sf named fields. */
typedef union
{
cl_char CL_ALIGNED(16) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
#endif
#if defined( __CL_CHAR2__)
__cl_char2 v2[8];
#endif
#if defined( __CL_CHAR4__)
__cl_char4 v4[4];
#endif
#if defined( __CL_CHAR8__ )
__cl_char8 v8[2];
#endif
#if defined( __CL_CHAR16__ )
__cl_char16 v16;
#endif
}cl_char16;
/* ---- cl_ucharn ---- */
/* 2 x cl_uchar, 2-byte alignment; s[], named fields and packed v2 alias
 * the same storage. */
typedef union
{
cl_uchar CL_ALIGNED(2) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uchar x, y; };
__CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; };
#endif
/* Fix: guard was lowercase "__cl_uchar2__", inconsistent with every other
 * capability macro in this header (e.g. __CL_UCHAR8__), so the packed v2
 * member could never be enabled. */
#if defined( __CL_UCHAR2__)
__cl_uchar2 v2;
#endif
}cl_uchar2;
/* 4 x cl_uchar, 4-byte alignment. */
typedef union
{
cl_uchar CL_ALIGNED(4) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
#endif
#if defined( __CL_UCHAR2__)
__cl_uchar2 v2[2];
#endif
#if defined( __CL_UCHAR4__)
__cl_uchar4 v4;
#endif
}cl_uchar4;
/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
typedef cl_uchar4 cl_uchar3;
/* 8 x cl_uchar, 8-byte alignment. */
typedef union
{
cl_uchar CL_ALIGNED(8) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
#endif
#if defined( __CL_UCHAR2__)
__cl_uchar2 v2[4];
#endif
#if defined( __CL_UCHAR4__)
__cl_uchar4 v4[2];
#endif
#if defined( __CL_UCHAR8__ )
__cl_uchar8 v8;
#endif
}cl_uchar8;
/* 16 x cl_uchar, 16-byte alignment. */
typedef union
{
cl_uchar CL_ALIGNED(16) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
#endif
#if defined( __CL_UCHAR2__)
__cl_uchar2 v2[8];
#endif
#if defined( __CL_UCHAR4__)
__cl_uchar4 v4[4];
#endif
#if defined( __CL_UCHAR8__ )
__cl_uchar8 v8[2];
#endif
#if defined( __CL_UCHAR16__ )
__cl_uchar16 v16;
#endif
}cl_uchar16;
/* ---- cl_shortn ---- */
/* 2 x cl_short, 4-byte alignment. */
typedef union
{
cl_short CL_ALIGNED(4) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_short x, y; };
__CL_ANON_STRUCT__ struct{ cl_short s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_short lo, hi; };
#endif
#if defined( __CL_SHORT2__)
__cl_short2 v2;
#endif
}cl_short2;
/* 4 x cl_short, 8-byte alignment. */
typedef union
{
cl_short CL_ALIGNED(8) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
#endif
#if defined( __CL_SHORT2__)
__cl_short2 v2[2];
#endif
#if defined( __CL_SHORT4__)
__cl_short4 v4;
#endif
}cl_short4;
/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
typedef cl_short4 cl_short3;
/* 8 x cl_short, 16-byte alignment. */
typedef union
{
cl_short CL_ALIGNED(16) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
#endif
#if defined( __CL_SHORT2__)
__cl_short2 v2[4];
#endif
#if defined( __CL_SHORT4__)
__cl_short4 v4[2];
#endif
#if defined( __CL_SHORT8__ )
__cl_short8 v8;
#endif
}cl_short8;
/* 16 x cl_short, 32-byte alignment. */
typedef union
{
cl_short CL_ALIGNED(32) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
#endif
#if defined( __CL_SHORT2__)
__cl_short2 v2[8];
#endif
#if defined( __CL_SHORT4__)
__cl_short4 v4[4];
#endif
#if defined( __CL_SHORT8__ )
__cl_short8 v8[2];
#endif
#if defined( __CL_SHORT16__ )
__cl_short16 v16;
#endif
}cl_short16;
/* ---- cl_ushortn ---- */
/* 2 x cl_ushort, 4-byte alignment. */
typedef union
{
cl_ushort CL_ALIGNED(4) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ushort x, y; };
__CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; };
#endif
#if defined( __CL_USHORT2__)
__cl_ushort2 v2;
#endif
}cl_ushort2;
/* 4 x cl_ushort, 8-byte alignment. */
typedef union
{
cl_ushort CL_ALIGNED(8) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
#endif
#if defined( __CL_USHORT2__)
__cl_ushort2 v2[2];
#endif
#if defined( __CL_USHORT4__)
__cl_ushort4 v4;
#endif
}cl_ushort4;
/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
typedef cl_ushort4 cl_ushort3;
/* 8 x cl_ushort, 16-byte alignment. */
typedef union
{
cl_ushort CL_ALIGNED(16) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
#endif
#if defined( __CL_USHORT2__)
__cl_ushort2 v2[4];
#endif
#if defined( __CL_USHORT4__)
__cl_ushort4 v4[2];
#endif
#if defined( __CL_USHORT8__ )
__cl_ushort8 v8;
#endif
}cl_ushort8;
/* 16 x cl_ushort, 32-byte alignment. */
typedef union
{
cl_ushort CL_ALIGNED(32) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
#endif
#if defined( __CL_USHORT2__)
__cl_ushort2 v2[8];
#endif
#if defined( __CL_USHORT4__)
__cl_ushort4 v4[4];
#endif
#if defined( __CL_USHORT8__ )
__cl_ushort8 v8[2];
#endif
#if defined( __CL_USHORT16__ )
__cl_ushort16 v16;
#endif
}cl_ushort16;
/* ---- cl_halfn ---- */
/* 2 x cl_half, 4-byte alignment. */
typedef union
{
cl_half CL_ALIGNED(4) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_half x, y; };
__CL_ANON_STRUCT__ struct{ cl_half s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_half lo, hi; };
#endif
#if defined( __CL_HALF2__)
__cl_half2 v2;
#endif
}cl_half2;
/* 4 x cl_half, 8-byte alignment. */
typedef union
{
cl_half CL_ALIGNED(8) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; };
#endif
#if defined( __CL_HALF2__)
__cl_half2 v2[2];
#endif
#if defined( __CL_HALF4__)
__cl_half4 v4;
#endif
}cl_half4;
/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */
typedef cl_half4 cl_half3;
/* 8 x cl_half, 16-byte alignment. */
typedef union
{
cl_half CL_ALIGNED(16) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; };
#endif
#if defined( __CL_HALF2__)
__cl_half2 v2[4];
#endif
#if defined( __CL_HALF4__)
__cl_half4 v4[2];
#endif
#if defined( __CL_HALF8__ )
__cl_half8 v8;
#endif
}cl_half8;
/* 16 x cl_half, 32-byte alignment. */
typedef union
{
cl_half CL_ALIGNED(32) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; };
#endif
#if defined( __CL_HALF2__)
__cl_half2 v2[8];
#endif
#if defined( __CL_HALF4__)
__cl_half4 v4[4];
#endif
#if defined( __CL_HALF8__ )
__cl_half8 v8[2];
#endif
#if defined( __CL_HALF16__ )
__cl_half16 v16;
#endif
}cl_half16;
/* ---- cl_intn ---- */
/* 2 x cl_int, 8-byte alignment. */
typedef union
{
cl_int CL_ALIGNED(8) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_int x, y; };
__CL_ANON_STRUCT__ struct{ cl_int s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_int lo, hi; };
#endif
#if defined( __CL_INT2__)
__cl_int2 v2;
#endif
}cl_int2;
/* 4 x cl_int, 16-byte alignment. */
typedef union
{
cl_int CL_ALIGNED(16) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
#endif
#if defined( __CL_INT2__)
__cl_int2 v2[2];
#endif
#if defined( __CL_INT4__)
__cl_int4 v4;
#endif
}cl_int4;
/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
typedef cl_int4 cl_int3;
/* 8 x cl_int, 32-byte alignment. */
typedef union
{
cl_int CL_ALIGNED(32) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
#endif
#if defined( __CL_INT2__)
__cl_int2 v2[4];
#endif
#if defined( __CL_INT4__)
__cl_int4 v4[2];
#endif
#if defined( __CL_INT8__ )
__cl_int8 v8;
#endif
}cl_int8;
/* 16 x cl_int, 64-byte alignment. */
typedef union
{
cl_int CL_ALIGNED(64) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
#endif
#if defined( __CL_INT2__)
__cl_int2 v2[8];
#endif
#if defined( __CL_INT4__)
__cl_int4 v4[4];
#endif
#if defined( __CL_INT8__ )
__cl_int8 v8[2];
#endif
#if defined( __CL_INT16__ )
__cl_int16 v16;
#endif
}cl_int16;
/* ---- cl_uintn ---- */
/* 2 x cl_uint, 8-byte alignment. */
typedef union
{
cl_uint CL_ALIGNED(8) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uint x, y; };
__CL_ANON_STRUCT__ struct{ cl_uint s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_uint lo, hi; };
#endif
#if defined( __CL_UINT2__)
__cl_uint2 v2;
#endif
}cl_uint2;
/* 4 x cl_uint, 16-byte alignment. */
typedef union
{
cl_uint CL_ALIGNED(16) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
#endif
#if defined( __CL_UINT2__)
__cl_uint2 v2[2];
#endif
#if defined( __CL_UINT4__)
__cl_uint4 v4;
#endif
}cl_uint4;
/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
typedef cl_uint4 cl_uint3;
/* 8 x cl_uint, 32-byte alignment. */
typedef union
{
cl_uint CL_ALIGNED(32) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
#endif
#if defined( __CL_UINT2__)
__cl_uint2 v2[4];
#endif
#if defined( __CL_UINT4__)
__cl_uint4 v4[2];
#endif
#if defined( __CL_UINT8__ )
__cl_uint8 v8;
#endif
}cl_uint8;
/* 16 x cl_uint, 64-byte alignment. */
typedef union
{
cl_uint CL_ALIGNED(64) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
#endif
#if defined( __CL_UINT2__)
__cl_uint2 v2[8];
#endif
#if defined( __CL_UINT4__)
__cl_uint4 v4[4];
#endif
#if defined( __CL_UINT8__ )
__cl_uint8 v8[2];
#endif
#if defined( __CL_UINT16__ )
__cl_uint16 v16;
#endif
}cl_uint16;
/* ---- cl_longn ---- */
/* 2 x cl_long, 16-byte alignment. */
typedef union
{
cl_long CL_ALIGNED(16) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_long x, y; };
__CL_ANON_STRUCT__ struct{ cl_long s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_long lo, hi; };
#endif
#if defined( __CL_LONG2__)
__cl_long2 v2;
#endif
}cl_long2;
/* 4 x cl_long, 32-byte alignment. */
typedef union
{
cl_long CL_ALIGNED(32) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
#endif
#if defined( __CL_LONG2__)
__cl_long2 v2[2];
#endif
#if defined( __CL_LONG4__)
__cl_long4 v4;
#endif
}cl_long4;
/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
typedef cl_long4 cl_long3;
/* 8 x cl_long, 64-byte alignment. */
typedef union
{
cl_long CL_ALIGNED(64) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
#endif
#if defined( __CL_LONG2__)
__cl_long2 v2[4];
#endif
#if defined( __CL_LONG4__)
__cl_long4 v4[2];
#endif
#if defined( __CL_LONG8__ )
__cl_long8 v8;
#endif
}cl_long8;
/* 16 x cl_long, 128-byte alignment. */
typedef union
{
cl_long CL_ALIGNED(128) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
#endif
#if defined( __CL_LONG2__)
__cl_long2 v2[8];
#endif
#if defined( __CL_LONG4__)
__cl_long4 v4[4];
#endif
#if defined( __CL_LONG8__ )
__cl_long8 v8[2];
#endif
#if defined( __CL_LONG16__ )
__cl_long16 v16;
#endif
}cl_long16;
/* ---- cl_ulongn ---- */
/* 2 x cl_ulong, 16-byte alignment. */
typedef union
{
cl_ulong CL_ALIGNED(16) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ulong x, y; };
__CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; };
#endif
#if defined( __CL_ULONG2__)
__cl_ulong2 v2;
#endif
}cl_ulong2;
/* 4 x cl_ulong, 32-byte alignment. */
typedef union
{
cl_ulong CL_ALIGNED(32) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
#endif
#if defined( __CL_ULONG2__)
__cl_ulong2 v2[2];
#endif
#if defined( __CL_ULONG4__)
__cl_ulong4 v4;
#endif
}cl_ulong4;
/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
typedef cl_ulong4 cl_ulong3;
/* 8 x cl_ulong, 64-byte alignment. */
typedef union
{
cl_ulong CL_ALIGNED(64) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
#endif
#if defined( __CL_ULONG2__)
__cl_ulong2 v2[4];
#endif
#if defined( __CL_ULONG4__)
__cl_ulong4 v4[2];
#endif
#if defined( __CL_ULONG8__ )
__cl_ulong8 v8;
#endif
}cl_ulong8;
/* 16 x cl_ulong, 128-byte alignment. */
typedef union
{
cl_ulong CL_ALIGNED(128) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
#endif
#if defined( __CL_ULONG2__)
__cl_ulong2 v2[8];
#endif
#if defined( __CL_ULONG4__)
__cl_ulong4 v4[4];
#endif
#if defined( __CL_ULONG8__ )
__cl_ulong8 v8[2];
#endif
#if defined( __CL_ULONG16__ )
__cl_ulong16 v16;
#endif
}cl_ulong16;
/* --- cl_floatn ---- */
/* 2 x cl_float, 8-byte alignment. */
typedef union
{
cl_float CL_ALIGNED(8) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_float x, y; };
__CL_ANON_STRUCT__ struct{ cl_float s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_float lo, hi; };
#endif
#if defined( __CL_FLOAT2__)
__cl_float2 v2;
#endif
}cl_float2;
/* 4 x cl_float, 16-byte alignment. */
typedef union
{
cl_float CL_ALIGNED(16) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; };
#endif
#if defined( __CL_FLOAT2__)
__cl_float2 v2[2];
#endif
#if defined( __CL_FLOAT4__)
__cl_float4 v4;
#endif
}cl_float4;
/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
typedef cl_float4 cl_float3;
/* 8 x cl_float, 32-byte alignment. */
typedef union
{
cl_float CL_ALIGNED(32) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; };
#endif
#if defined( __CL_FLOAT2__)
__cl_float2 v2[4];
#endif
#if defined( __CL_FLOAT4__)
__cl_float4 v4[2];
#endif
#if defined( __CL_FLOAT8__ )
__cl_float8 v8;
#endif
}cl_float8;
/* 16 x cl_float, 64-byte alignment. */
typedef union
{
cl_float CL_ALIGNED(64) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
#endif
#if defined( __CL_FLOAT2__)
__cl_float2 v2[8];
#endif
#if defined( __CL_FLOAT4__)
__cl_float4 v4[4];
#endif
#if defined( __CL_FLOAT8__ )
__cl_float8 v8[2];
#endif
#if defined( __CL_FLOAT16__ )
__cl_float16 v16;
#endif
}cl_float16;
/* --- cl_doublen ---- */
/* 2 x cl_double, 16-byte alignment. */
typedef union
{
cl_double CL_ALIGNED(16) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_double x, y; };
__CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
#endif
#if defined( __CL_DOUBLE2__)
__cl_double2 v2;
#endif
}cl_double2;
/* 4 x cl_double, 32-byte alignment. */
typedef union
{
cl_double CL_ALIGNED(32) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
#endif
#if defined( __CL_DOUBLE2__)
__cl_double2 v2[2];
#endif
#if defined( __CL_DOUBLE4__)
__cl_double4 v4;
#endif
}cl_double4;
/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
typedef cl_double4 cl_double3;
/* 8 x cl_double, 64-byte alignment. */
typedef union
{
cl_double CL_ALIGNED(64) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
#endif
#if defined( __CL_DOUBLE2__)
__cl_double2 v2[4];
#endif
#if defined( __CL_DOUBLE4__)
__cl_double4 v4[2];
#endif
#if defined( __CL_DOUBLE8__ )
__cl_double8 v8;
#endif
}cl_double8;
/* 16 x cl_double, 128-byte alignment. */
typedef union
{
cl_double CL_ALIGNED(128) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
#endif
#if defined( __CL_DOUBLE2__)
__cl_double2 v2[8];
#endif
#if defined( __CL_DOUBLE4__)
__cl_double4 v4[4];
#endif
#if defined( __CL_DOUBLE8__ )
__cl_double8 v8[2];
#endif
#if defined( __CL_DOUBLE16__ )
__cl_double16 v16;
#endif
}cl_double16;
/* Macro to facilitate debugging
* Usage:
* Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
* The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \"
* Each line thereafter of OpenCL C source must end with: \n\
* The last line ends in ";
*
* Example:
*
* const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
* kernel void foo( int a, float * b ) \n\
* { \n\
* // my comment \n\
* *b[ get_global_id(0)] = a; \n\
* } \n\
* ";
*
* This should correctly set up the line, (column) and file information for your source
* string so you can do source level debugging.
*/
/* Two-level stringification so that __LINE__ is macro-expanded before being
 * turned into a string literal; the result prefixes a kernel source string
 * with a #line directive naming this file and line. */
#define __CL_STRINGIFY( _x ) # _x
#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x )
#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
#ifdef __cplusplus
}
#endif
#undef __CL_HAS_ANON_STRUCT__
#undef __CL_ANON_STRUCT__
#if defined( _WIN32) && defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( pop )
#endif
#endif
#endif /* __CL_PLATFORM_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_va_api_media_sharing_intel.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2019 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_va_api_media_sharing_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
#include
#include
#include
#ifdef __cplusplus
extern "C" {
#endif
/******************************************
* cl_intel_va_api_media_sharing extension *
*******************************************/
#define cl_intel_va_api_media_sharing 1
/* error codes */
#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098
#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099
#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100
#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101
/* cl_va_api_device_source_intel */
#define CL_VA_API_DISPLAY_INTEL 0x4094
/* cl_va_api_device_set_intel */
#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095
#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096
/* cl_context_info */
#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097
/* cl_mem_info */
#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098
/* cl_image_info */
#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A
#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B
typedef cl_uint cl_va_api_device_source_intel;
typedef cl_uint cl_va_api_device_set_intel;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
cl_platform_id platform,
cl_va_api_device_source_intel media_adapter_type,
void* media_adapter,
cl_va_api_device_set_intel media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
cl_platform_id platform,
cl_va_api_device_source_intel media_adapter_type,
void* media_adapter,
cl_va_api_device_set_intel media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromVA_APIMediaSurfaceINTEL(
cl_context context,
cl_mem_flags flags,
VASurfaceID* surface,
cl_uint plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(
cl_context context,
cl_mem_flags flags,
VASurfaceID* surface,
cl_uint plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireVA_APIMediaSurfacesINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseVA_APIMediaSurfacesINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_version.h
================================================
/*******************************************************************************
* Copyright (c) 2018-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __CL_VERSION_H
#define __CL_VERSION_H
/* Detect which version to target */
/* Users should define CL_TARGET_OPENCL_VERSION before including any CL
 * header; otherwise the headers target OpenCL 2.2 and warn about it. */
#if !defined(CL_TARGET_OPENCL_VERSION)
#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)")
#define CL_TARGET_OPENCL_VERSION 220
#endif
#if CL_TARGET_OPENCL_VERSION != 100 && \
CL_TARGET_OPENCL_VERSION != 110 && \
CL_TARGET_OPENCL_VERSION != 120 && \
CL_TARGET_OPENCL_VERSION != 200 && \
CL_TARGET_OPENCL_VERSION != 210 && \
CL_TARGET_OPENCL_VERSION != 220 && \
CL_TARGET_OPENCL_VERSION != 300
#pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220, 300). Defaulting to 220 (OpenCL 2.2)")
#undef CL_TARGET_OPENCL_VERSION
#define CL_TARGET_OPENCL_VERSION 220
#endif
/* OpenCL Version */
/* Define every CL_VERSION_x_y macro at or below the targeted version. */
#if CL_TARGET_OPENCL_VERSION >= 300 && !defined(CL_VERSION_3_0)
#define CL_VERSION_3_0 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2)
#define CL_VERSION_2_2 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1)
#define CL_VERSION_2_1 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0)
#define CL_VERSION_2_0 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2)
#define CL_VERSION_1_2 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1)
#define CL_VERSION_1_1 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0)
#define CL_VERSION_1_0 1
#endif
/* Allow deprecated APIs for older OpenCL versions. */
/* Each CL_USE_DEPRECATED_OPENCL_x_y_APIS macro keeps APIs deprecated after
 * version x.y visible when targeting x.y or older. */
#if CL_TARGET_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_2_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_1_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_0_APIS
#endif
#endif /* __CL_VERSION_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/opencl.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_H
#define __OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#include
#include
#include
#include
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_H */
================================================
FILE: GpuMemLatency/atomic_test.c
================================================
#include "opencltest.h"
/*
 * Measures integer atomic latency by having two work-items bounce an atomic
 * value between them via the given kernel.
 *
 * context/command_queue/kernel: initialized OpenCL objects; the kernel takes
 *   (global uint* target, int iterations, global int* result).
 * iterations: atomic ping-pong iterations the kernel executes.
 * local: nonzero = place both work-items in one work-group (local/same-group
 *   atomics); zero = one work-item per work-group.
 * time_ms: out-param receiving the raw elapsed wall time in milliseconds.
 *
 * Returns estimated latency in nanoseconds per atomic operation, or 0 on
 * kernel launch failure.
 */
float int_atomic_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations,
    short local,
    uint32_t *time_ms)
{
    cl_int ret;
    cl_int result = 0;            /* initial value uploaded to the result buffer */
    size_t global_item_size = 2;  /* two work-items contend on the atomic */
    size_t local_item_size = 1;   /* default: each work-item in its own group */
    float latency;
    uint32_t time_diff_ms;
    uint32_t A = 0;

    if (local)
    {
        /* Both work-items share one work-group, so the atomic can be
         * serviced with group-local resources. */
        local_item_size = 2;
    }

    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(uint32_t), NULL, &ret);
    /* Bug fix: the error code used to be written into &result, clobbering the
     * intended initial value (0) that is uploaded to the buffer below. */
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &ret);

    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, sizeof(uint32_t), &A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(cl_int), &result, 0, NULL, NULL);
    clFinish(command_queue);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        latency = 0;
        goto cleanup;
    }
    clFinish(command_queue);
    time_diff_ms = end_timing();

    *time_ms = time_diff_ms;
    /* 1e6 converts ms -> ns; /2 because each iteration involves both
     * work-items touching the atomic once. */
    latency = (1e6 * (float)time_diff_ms / (float)(iterations)) / 2;

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    return latency;
}
// All-to-all "core to core" atomic latency test: for every ordered pair of
// compute units, measures atomic ping-pong latency between a work-item pinned
// to each (kernel args 3/4 select the pair), then prints a CSV matrix to
// stdout ('x' on the diagonal). Progress goes to stderr.
// Returns the last measured latency in nanoseconds, or 0 on failure / when
// fewer than two compute units are available.
float c2c_atomic_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations)
{
    cl_int ret;
    cl_int result = 0;
    size_t global_item_size;
    size_t local_item_size = 1;
    float latency = 0.0f; // FIX: was returned uninitialized when cuCount < 2
    uint32_t time_diff_ms;
    uint32_t A;
    cl_uint cuCount = getCuCount();
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(uint32_t), NULL, &ret);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &ret);
    global_item_size = cuCount; // one work-item per compute unit

    float* result_arr = (float*)malloc(sizeof(float) * cuCount * cuCount);
    if (result_arr == NULL)
    {
        // FIX: previously an allocation failure led to a NULL store below
        fprintf(stderr, "Failed to allocate latency matrix\n");
        goto cleanup;
    }

    for (cl_int t1_idx = 0; t1_idx < (cl_int)cuCount; t1_idx++)
    {
        for (cl_int t2_idx = 0; t2_idx < (cl_int)cuCount; t2_idx++)
        {
            if (t1_idx == t2_idx) continue;
            fprintf(stderr, "Testing %d -> %d\n", t1_idx, t2_idx);
            A = 0;
            ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, sizeof(uint32_t), &A, 0, NULL, NULL);
            ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(cl_int), &result, 0, NULL, NULL);
            clFinish(command_queue);
            clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
            clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);
            clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
            clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&t1_idx);
            clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&t2_idx);
            start_timing();
            ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
            if (ret != CL_SUCCESS)
            {
                fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
                latency = 0;
                goto cleanup;
            }
            clFinish(command_queue);
            time_diff_ms = end_timing();
            // two hops per iteration -> divide by 2 for one-way latency (ns)
            latency = (1e6 * (float)time_diff_ms / (float)(iterations)) / 2;
            fprintf(stderr, "%d -> %d: %f\n", t1_idx, t2_idx, latency);
            result_arr[t1_idx * cuCount + t2_idx] = latency;
        }
    }

    // CSV header row: destination CU indices
    for (cl_int i = 0; i < (cl_int)cuCount; i++)
    {
        printf(",%d", i);
    }
    printf("\n");
    for (cl_int t1_idx = 0; t1_idx < (cl_int)cuCount; t1_idx++)
    {
        printf("%d", t1_idx);
        for (cl_int t2_idx = 0; t2_idx < (cl_int)cuCount; t2_idx++)
        {
            if (t1_idx == t2_idx) printf(",x");
            else printf(",%f", result_arr[t1_idx * cuCount + t2_idx]);
        }
        printf("\n");
    }

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    free(result_arr); // free(NULL) is safe if allocation failed
    return latency;
}
// Integer atomic-add throughput test. Each of 'threads' work-items hammers
// its own slot with atomic adds; iteration count is scaled up until the run
// lasts at least TARGET_TIME_MS / 2.
// Returns throughput in giga-operations per second, or 0 on failure.
float int_atomic_add_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    size_t threads,
    size_t localsize)
{
    // Loop unroll factor inside the kernel
    const float opsPerIteration = 8.0f;
    cl_int ret;
    int64_t time_diff_ms = 0;
    float gOpsPerSec = 0.0f; // FIX: initialize so failure paths return 0
    uint32_t iterations = 7000;

    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * threads);
    if (A == NULL)
    {
        // FIX: allocation failure previously fell through to a NULL store
        fprintf(stderr, "Failed to allocate host array for %d threads\n", (int)threads);
        return 0.0f;
    }
    for (size_t i = 0; i < threads; i++) A[i] = (uint32_t)i;

    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint32_t) * threads, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, sizeof(uint32_t) * threads, A, 0, NULL, NULL);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);
    clFinish(command_queue);

    // keep re-running with more iterations until the test runs long enough
    // to give a stable measurement
    while (time_diff_ms < TARGET_TIME_MS / 2) {
        start_timing();
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &threads, &localsize, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
            fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
            gOpsPerSec = 0;
            goto int_atomic_add_test_end;
        }
        clFinish(command_queue);
        time_diff_ms = end_timing();
        float totalOps = (float)iterations * opsPerIteration * (float)threads;
        gOpsPerSec = ((float)totalOps / 1e9) / ((float)time_diff_ms / 1000);
        fprintf(stderr, "GOPS: %f, elapsed time: %lld\n", gOpsPerSec, (long long)time_diff_ms);
        iterations = adjust_iterations(iterations, time_diff_ms);
        clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);
    }

int_atomic_add_test_end:
    clReleaseMemObject(a_mem_obj);
    free(A);
    return gOpsPerSec;
}
================================================
FILE: GpuMemLatency/bw_test.c
================================================
#include "opencltest.h"
// Global memory read bandwidth test.
// list_size: test array size in 32-bit floats (kernel reads float4 vectors)
// thread_count / local_size: global and local work sizes
// skip: stride between workgroup start offsets; 0 = auto heuristic
// chase_iterations: reads per work-item
// Returns bandwidth in GB/s, or 0 on failure.
float bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint64_t list_size,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t skip,
    uint32_t chase_iterations)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    float bandwidth, total_data_gb;
    cl_int ret;
    cl_int float4size = list_size / 4; // element count in float4 units
    int64_t time_diff_ms;

    if (skip == 0)
    {
        // nemes's read-combining-defeating heuristic
        uint32_t region_size = list_size * sizeof(float);
        uint32_t current_region_steps = (uint32_t)(region_size / (local_size * 4));
        skip = (chase_iterations + current_region_steps + 1) * local_size * 4;
    }

    float* A = (float*)malloc(sizeof(float) * list_size);
    float* result = (float*)malloc(sizeof(float) * thread_count);
    // assume that cl_uint size is 4 bytes, same as float size
    cl_uint* start_offsets = (cl_uint*)malloc(sizeof(cl_uint) * thread_count);
    cl_uint* calculated_offsets = (cl_uint*)malloc(sizeof(cl_uint) * thread_count);
    if (!A || !result || !start_offsets || !calculated_offsets)
    {
        // FIX: previously only warned, then dereferenced NULL below.
        // Also: %lu is wrong for uint64_t on LLP64 platforms.
        fprintf(stderr, "Failed to allocate memory for test size %llu KB\n",
            (unsigned long long)(list_size * sizeof(float) / 1024));
        free(A);
        free(result);
        free(start_offsets);
        free(calculated_offsets);
        return 0.0f;
    }
    memset(calculated_offsets, 0, sizeof(uint32_t) * thread_count);

    for (uint32_t i = 0; i < list_size; i++)
    {
        A[i] = (float)(i * 0.5);
    }

    // tell each thread where to start
    for (uint32_t i = 0; i < thread_count; i++)
    {
        uint32_t localId = i % local_size;
        uint32_t groupId = i / local_size;
        start_offsets[i] = (cl_uint)((groupId * skip * local_size + localId) % (float4size - 1));
        // randomly start each workgroup somewhere - ends up being really bad
        /*cl_uint groupOffset = rand() % (float4size / local_size);
        start_offsets[i] = (cl_uint)((groupOffset * local_size + localId) % (float4size - 1));*/
    }

    // copy array to device
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_size * sizeof(float), NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, list_size * sizeof(float), A, 0, NULL, NULL);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL);
    cl_mem start_offsets_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint) * thread_count, NULL, &ret);
    if (ret != 0) fprintf(stderr, "create buffer for start offsets failed. ret = %d\n", ret);
    ret = clEnqueueWriteBuffer(command_queue, start_offsets_obj, CL_TRUE, 0, sizeof(cl_uint) * thread_count, start_offsets, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue write buffer for start offsets failed. ret = %d\n", ret);

    // Set kernel arguments for __kernel void sum_bw_test(__global float* A, int count, int float4size, __global float* ret, int skip, __global int *startPositions)
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&float4size);
    clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&result_obj);
    clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&skip);
    clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&start_offsets_obj);
    clFinish(command_queue); // writes should be blocking, but are they?

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    time_diff_ms = end_timing();

    // each thread does iterations reads
    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count + thread_count) / 1e9;
    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;
    //fprintf(stderr, "%llu ms, %llu GB\n", time_diff_ms, total_data_gb);

    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret);
    clFinish(command_queue);

    // read back the offsets the kernel actually used and cross-check them
    // against the host-side calculation to catch indexing bugs
    ret = clEnqueueReadBuffer(command_queue, start_offsets_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, calculated_offsets, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for start offsets failed. ret = %d\n", ret);
    clFinish(command_queue);
    if (memcmp(calculated_offsets, start_offsets, sizeof(uint32_t) * thread_count))
    {
        fprintf(stderr, "mismatch in calculated start offsets\n");
        for (uint32_t i = 0; i < thread_count; i++)
        {
            if (calculated_offsets[i] != start_offsets[i]) {
                fprintf(stderr, "At index %u, calculated from GPU = %u, calculated on CPU = %u. skip=%u\n", i, calculated_offsets[i], start_offsets[i], skip);
                break;
            }
        }
    }
    //fprintf(stderr, "Finished reading result. Sum: %d\n", result[0]);

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    clReleaseMemObject(start_offsets_obj);
    free(A);
    free(result);
    free(start_offsets);
    free(calculated_offsets);
    return bandwidth;
}
// Texture (2D image) sampling bandwidth test.
// width/height: image dimensions in pixels (CL_R / CL_FLOAT)
// randomize: nonzero fills the image with random data instead of a ramp
// chase_iterations: samples per work-item; each sample is a 4-wide vector
// time_ms: out parameter, elapsed wall time in ms (set only on success)
// Returns sampling rate in gigatexels/s, or 0 on failure.
float tex_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint64_t width,
    uint64_t height,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t randomize,
    uint32_t chase_iterations,
    int64_t *time_ms)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    float texels = 0;
    cl_int ret;
    int64_t time_diff_ms;
    // element count; 3x overallocated relative to the width*height single
    // channel image created below — NOTE(review): looks intentional slack,
    // confirm against the kernel's addressing
    uint64_t tex_array_size = 3 * width * height;
    cl_mem tex_mem_obj = NULL, result_obj = NULL;

    float* A = (float*)malloc(sizeof(float) * tex_array_size);
    float* result = (float*)malloc(sizeof(float) * thread_count);
    if (!A || !result)
    {
        // FIX: previously only warned, then wrote through a NULL pointer
        fprintf(stderr, "Failed to allocate memory for %llu x %llu texture\n",
            (unsigned long long)width, (unsigned long long)height);
        free(A);
        free(result);
        return 0;
    }

    // fill array
    for (uint64_t i = 0; i < tex_array_size; i++)
    {
        A[i] = randomize ? rand() * 0.2f : (float)(i * 0.5);
    }

    cl_image_desc imageDesc;
    memset(&imageDesc, 0, sizeof(cl_image_desc));
    imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
    imageDesc.image_width = width;
    imageDesc.image_height = height;

    cl_image_format imageFormat;
    imageFormat.image_channel_order = CL_R;
    imageFormat.image_channel_data_type = CL_FLOAT;

    // FIX: host_ptr must be NULL when neither CL_MEM_USE_HOST_PTR nor
    // CL_MEM_COPY_HOST_PTR is set (spec mandates CL_INVALID_HOST_PTR
    // otherwise); the data is uploaded with clEnqueueWriteImage below.
    tex_mem_obj = clCreateImage(context, CL_MEM_READ_ONLY, &imageFormat, &imageDesc, NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to create 2d texture: %d\n", ret);
        goto tex_bw_cleanup;
    }

    size_t origin[] = { 0, 0, 0 };
    size_t region[] = { width, height, 1 };
    ret = clEnqueueWriteImage(command_queue, tex_mem_obj, CL_TRUE, origin, region, 0, 0, A, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to copy 2d texture: %d\n", ret);
        goto tex_bw_cleanup;
    }
    fprintf(stderr, "Created image\n");

    result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tex_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue); // writes should be blocking, but are they?

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        texels = 0;
        goto tex_bw_cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        texels = 0;
        goto tex_bw_cleanup;
    }
    time_diff_ms = end_timing();
    fprintf(stderr, "elapsed time: %lld ms\n", (long long)time_diff_ms);

    // each thread does iterations samples, and each sample returns a 4-wide vector
    texels = 1000 * (float)(chase_iterations * thread_count * 4 / 1e9) / (float)time_diff_ms;
    // FIX: %lu was wrong for int64_t time_diff_ms
    fprintf(stderr, "%u iterations, %u threads, %lld ms\n", chase_iterations, thread_count, (long long)time_diff_ms);

    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret);
    clFinish(command_queue);
    *time_ms = time_diff_ms;

tex_bw_cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    // guard releases: objects may legitimately be NULL on early failure
    if (tex_mem_obj) clReleaseMemObject(tex_mem_obj);
    if (result_obj) clReleaseMemObject(result_obj);
    free(A);
    free(result);
    return texels;
}
// must be at least as large as local memory test size in kernel
// list size in 32-bit elements
#define local_mem_bw_test_size 8192

// Local (workgroup/shared) memory bandwidth test. The kernel copies the
// source buffer into local memory and reads it chase_iterations times.
// time_ms: out parameter, elapsed wall time in ms (set only on success).
// Returns bandwidth in GB/s, or 0 on failure.
float local_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int64_t *time_ms)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    float bandwidth, total_data_gb;
    cl_int ret;
    int64_t time_diff_ms;

    // FIX: pointers were initialized from (uint32_t*) casts
    float* A = (float*)malloc(sizeof(float) * local_mem_bw_test_size);
    float* result = (float*)malloc(sizeof(float) * thread_count);
    if (!A || !result)
    {
        // FIX: previously only warned, then dereferenced NULL
        fprintf(stderr, "Failed to allocate memory for test size %d KB\n",
            (int)(local_mem_bw_test_size * sizeof(float) / 1024));
        free(A);
        free(result);
        return 0.0f;
    }

    for (uint32_t i = 0; i < local_mem_bw_test_size; i++)
    {
        A[i] = i + .02;
    }

    // copy array to device
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, local_mem_bw_test_size * sizeof(float), NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, local_mem_bw_test_size * sizeof(float), A, 0, NULL, NULL);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue); // writes should be blocking, but are they?

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    time_diff_ms = end_timing();
    *time_ms = time_diff_ms;

    // each thread does iterations reads
    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count) / 1e9;
    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;
    //fprintf(stderr, "%llu ms, %llu GB\n", time_diff_ms, total_data_gb);

    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret);
    clFinish(command_queue);

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    free(A);
    free(result);
    return bandwidth;
}
#define buffer_test_size 4096 // 1024x uint4

// Buffer-backed 1D image (texel buffer) read bandwidth test.
// The source buffer is wrapped in a CL_MEM_OBJECT_IMAGE1D_BUFFER image and
// the kernel reads it chase_iterations times per work-item.
// time_ms: out parameter, elapsed wall time in ms (set only on success).
// Returns bandwidth in GB/s, or 0 on failure.
float buffer_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int64_t* time_ms)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    float bandwidth = 0.0f, total_data_gb;
    cl_int ret;
    int64_t time_diff_ms;
    // initialize so cleanup can safely guard-release on early failure
    cl_mem a_mem_obj = NULL, tex_obj = NULL, result_obj = NULL;

    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * buffer_test_size);
    // FIX: result was allocated with a mismatched (uint32_t*) cast
    float* result = (float*)malloc(sizeof(float) * thread_count);
    if (!A || !result)
    {
        // FIX: previously warned (with the wrong size macro) and fell through
        fprintf(stderr, "Failed to allocate memory for test size %d KB\n",
            (int)(buffer_test_size * sizeof(uint32_t) / 1024));
        free(A);
        free(result);
        return 0.0f;
    }

    for (uint32_t i = 0; i < buffer_test_size; i++)
    {
        A[i] = i + 1;
    }

    // copy array to device
    a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_test_size * sizeof(uint32_t), NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, buffer_test_size * sizeof(uint32_t), A, 0, NULL, NULL);

    // wrap the buffer in a 1D image so the kernel can read through the
    // texture path
    cl_image_format imageFormat;
    imageFormat.image_channel_data_type = CL_UNSIGNED_INT32;
    imageFormat.image_channel_order = CL_R;
    cl_image_desc imageDesc;
    memset(&imageDesc, 0, sizeof(cl_image_desc));
    imageDesc.buffer = a_mem_obj;
    imageDesc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
    imageDesc.image_width = buffer_test_size; // width in pixels
    // FIX: was a stuttered double assignment (tex_obj = tex_obj = ...)
    tex_obj = clCreateImage(context, CL_MEM_READ_ONLY, &imageFormat, &imageDesc, NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to create 1d buffer image: %d\n", ret);
        goto cleanup;
    }

    size_t origin[] = { 0, 0, 0 };
    size_t region[] = { imageDesc.image_width, 1, 1 };
    ret = clEnqueueWriteImage(command_queue, tex_obj, CL_TRUE, origin, region, 0, 0, A, 0, NULL, NULL);

    result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tex_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue); // writes should be blocking, but are they?

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    time_diff_ms = end_timing();
    *time_ms = time_diff_ms;

    // each thread does iterations reads
    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count) / 1e9;
    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;
    //fprintf(stderr, "%llu ms, %llu GB\n", time_diff_ms, total_data_gb);

    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret);
    clFinish(command_queue);

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    if (tex_obj) clReleaseMemObject(tex_obj); // FIX: image object was leaked
    if (a_mem_obj) clReleaseMemObject(a_mem_obj);
    if (result_obj) clReleaseMemObject(result_obj);
    free(A);
    free(result);
    return bandwidth;
}
// Local memory pointer-chasing bandwidth test. The host array is filled with
// a fixed stride-by-wave_size pattern (wrapping at the test size) that each
// work-item chases chase_iterations times.
// time_ms: out parameter, elapsed wall time in ms (set only on success).
// Returns bandwidth in GB/s, or 0 on failure.
float local_chase_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    uint32_t wave_size,
    int64_t* time_ms)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    float bandwidth, total_data_gb;
    cl_int ret;
    int64_t time_diff_ms;

    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * local_mem_bw_test_size);
    uint32_t* result = (uint32_t*)malloc(sizeof(uint32_t) * thread_count);
    if (!A || !result)
    {
        // FIX: previously only warned, then dereferenced NULL
        fprintf(stderr, "Failed to allocate memory for test size %d KB\n",
            (int)(local_mem_bw_test_size * sizeof(uint32_t) / 1024));
        free(A);
        free(result);
        return 0.0f;
    }

    for (uint32_t i = 0; i < local_mem_bw_test_size; i++)
    {
        // assumes local_mem_bw_test_size is a power of 2.
        // parenthesized for clarity; '+' already bound tighter than '&'
        A[i] = (i + wave_size) & (local_mem_bw_test_size - 1);
    }

    // copy array to device
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, local_mem_bw_test_size * sizeof(uint32_t), NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, local_mem_bw_test_size * sizeof(uint32_t), A, 0, NULL, NULL);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint32_t) * thread_count, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue); // writes should be blocking, but are they?

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    time_diff_ms = end_timing();
    *time_ms = time_diff_ms;

    // each thread does iterations reads (4 bytes each)
    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count) / 1e9;
    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;
    //fprintf(stderr, "%llu ms, %llu GB\n", time_diff_ms, total_data_gb);

    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret);
    clFinish(command_queue);

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    free(A);
    free(result);
    return bandwidth;
}
#define local64_test_size 2048

// 64-bit element local memory bandwidth test.
// time_ms: out parameter, elapsed wall time in ms (set only on success).
// Returns bandwidth in GB/s, or 0 on failure.
float local_64_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int64_t* time_ms)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    float bandwidth, total_data_gb;
    cl_int ret;
    int64_t time_diff_ms;

    uint64_t* A = (uint64_t*)malloc(sizeof(uint64_t) * local64_test_size);
    uint64_t* result = (uint64_t*)malloc(sizeof(uint64_t) * thread_count);
    if (!A || !result)
    {
        // FIX: previously only warned, then dereferenced NULL; size in the
        // message now reflects 8-byte elements
        fprintf(stderr, "Failed to allocate memory for test size %d KB\n",
            (int)(local64_test_size * sizeof(uint64_t) / 1024));
        free(A);
        free(result);
        return 0.0f;
    }

    for (uint64_t i = 0; i < local64_test_size; i++)
    {
        A[i] = i;
    }

    // copy array to device
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, local64_test_size * sizeof(uint64_t), NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, local64_test_size * sizeof(uint64_t), A, 0, NULL, NULL);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint64_t) * thread_count, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint64_t) * thread_count, result, 0, NULL, NULL);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue); // writes should be blocking, but are they?

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    time_diff_ms = end_timing();
    *time_ms = time_diff_ms;

    // each thread does iterations reads
    // NOTE(review): elements here are 8 bytes but this formula counts 4 bytes
    // per read (sizeof(float)) — confirm against the kernel before changing,
    // as fixing it would double all reported numbers.
    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count + thread_count) / 1e9;
    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;

    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint64_t) * thread_count, result, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret);
    clFinish(command_queue);

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    free(A);
    free(result);
    return bandwidth;
}
// default test sizes for link bandwidth, in KB
const uint64_t default_link_test_sizes[] = { 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152 };

// Host <-> GPU link bandwidth test. For each test size, repeatedly copies a
// host buffer to the device and back (blocking transfers), printing per-size
// CSV results and finally the peak bandwidth seen in either direction.
// iterations: scaling factor for the per-size transfer count.
void link_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations)
{
    cl_int ret;
    float gpu_to_host_bandwidth, host_to_gpu_bandwidth, total_data_gb;
    uint32_t time_diff_ms, loop_iterations;
    uint32_t* A;
    // FIX: element size taken from the array itself instead of a hard-coded
    // 'unsigned long long' (same value, but can't silently go stale)
    int test_size_count = sizeof(default_link_test_sizes) / sizeof(default_link_test_sizes[0]);
    float* results = (float*)malloc(sizeof(float) * 2 * test_size_count);
    if (results == NULL)
    {
        fprintf(stderr, "Failed to allocate results array\n");
        return;
    }
    memset(results, 0, sizeof(float) * 2 * test_size_count);

    printf("Copy Size (KB), Host to GPU (GB/s), GPU to Host (GB/s)\n");
    for (int size_idx = 0; size_idx < test_size_count; size_idx++) {
        uint64_t testSizeBytes = default_link_test_sizes[size_idx] * 1024;
        uint64_t testSizeKb = default_link_test_sizes[size_idx];
        if (testSizeBytes > max_global_test_size) {
            // FIX: %d and %lu were wrong for 64-bit values (LLP64)
            printf("%llu K would exceed device's max buffer size of %llu K, stopping here.\n",
                (unsigned long long)testSizeKb, (unsigned long long)(max_global_test_size / 1024));
            break;
        }
        A = (uint32_t*)malloc(testSizeBytes);
        if (A == NULL) {
            // FIX: allocation was previously unchecked
            fprintf(stderr, "Failed to allocate %llu KB host buffer\n", (unsigned long long)testSizeKb);
            break;
        }
        memset(A, 0, testSizeBytes);
        cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, testSizeBytes, NULL, &ret);
        clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
        // scale iteration count down with size to keep total transferred data
        // roughly constant
        loop_iterations = ((uint64_t)iterations * 1000) / (uint64_t)testSizeBytes;
        if (loop_iterations == 0) loop_iterations = 1; // FIX: avoid a 0-transfer (NaN bandwidth) pass

        start_timing();
        for (uint32_t iter_idx = 0; iter_idx < loop_iterations; iter_idx++)
        {
            ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, testSizeBytes, A, 0, NULL, NULL);
            clFinish(command_queue);
        }
        time_diff_ms = end_timing();
        total_data_gb = ((float)loop_iterations * testSizeBytes) / 1e9;
        host_to_gpu_bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;
        results[size_idx * 2] = host_to_gpu_bandwidth;

        start_timing();
        for (uint32_t iter_idx = 0; iter_idx < loop_iterations; iter_idx++)
        {
            ret = clEnqueueReadBuffer(command_queue, a_mem_obj, CL_TRUE, 0, testSizeBytes, A, 0, NULL, NULL);
            clFinish(command_queue);
        }
        time_diff_ms = end_timing();
        total_data_gb = ((float)loop_iterations * testSizeBytes) / 1e9;
        gpu_to_host_bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;
        results[size_idx * 2 + 1] = gpu_to_host_bandwidth;

        printf("%llu,%f,%f\n", (unsigned long long)testSizeKb, host_to_gpu_bandwidth, gpu_to_host_bandwidth);
        clReleaseMemObject(a_mem_obj);
        free(A);
    }

    // report the best bandwidth seen in either direction
    float max = 0;
    for (int size_idx = 0; size_idx < test_size_count; size_idx++) {
        if (results[size_idx * 2] > max) max = results[size_idx * 2];
        if (results[size_idx * 2 + 1] > max) max = results[size_idx * 2 + 1];
    }
    printf("Link bandwidth: %f GB/s\n", max);

    free(results);
    clFlush(command_queue);
    clFinish(command_queue);
}
================================================
FILE: GpuMemLatency/common.c
================================================
#include "opencltest.h"
// Device/platform chosen by get_context_from_user(); read by the query
// helpers below (getCuCount, checkExtensionSupport, ...).
cl_device_id selected_device_id;
cl_platform_id selected_platform_id;
// NOTE(review): appears to cap per-buffer test sizes (see link_bw_test);
// assignment is not visible in this file — confirm where it is set.
cl_ulong max_global_test_size;
// presumably a flag to dump the compiled program binary; set elsewhere — TODO confirm
int saveprogram = 0;
// Fills pattern_arr with a random pointer-chasing pattern using Sattolo's
// algorithm (a Fisher-Yates variant that always yields one single cycle).
// list_size: array length in 32-bit elements.
// byte_increment: spacing between used slots, in bytes; assumes it is a
//   multiple of sizeof(uint32_t). Only every 'increment'-th slot is used,
//   and each used slot holds the index of the next slot in the cycle.
void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) {
    uint32_t increment = byte_increment / sizeof(uint32_t);
    uint32_t element_count = list_size / increment;
    for (uint32_t i = 0; i < element_count; i++) {
        pattern_arr[i * increment] = i * increment;
    }
    // Sattolo's: for i from n-1 down to 1, swap a[i] with a[j], j uniform in
    // [0, i-1]. Strictly j < i guarantees the result is one full cycle.
    uint32_t iter = element_count;
    while (iter > 1) {
        iter -= 1;
        // FIX: was rand() % (iter - 1), which could never select j == iter-1
        // and biased the permutation; the correct range is [0, iter-1].
        uint32_t j = rand() % iter;
        uint32_t tmp = pattern_arr[iter * increment];
        pattern_arr[iter * increment] = pattern_arr[j * increment];
        pattern_arr[j * increment] = tmp;
    }
}
cl_uint getCuCount() {
cl_uint cuCount;
size_t cuCountLen = sizeof(cl_uint);
if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_COMPUTE_UNITS, cuCountLen, &cuCount, &cuCountLen))
{
fprintf(stderr, "Could not get number of compute units\n");
return 0;
}
return cuCount;
}
// Queries CL_DEVICE_MAX_WORK_GROUP_SIZE for the selected device.
// Returns the maximum workgroup size, or 0 if the query fails.
size_t getMaxWorkgroupSize()
{
    size_t maxWorkgroupSize;
    size_t workgroupSizeLen = sizeof(size_t);
    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, workgroupSizeLen, &maxWorkgroupSize, &workgroupSizeLen))
    {
        // FIX: message was copy-pasted from getCuCount()
        fprintf(stderr, "Could not get max workgroup size\n");
        return 0;
    }
    return maxWorkgroupSize;
}
cl_ulong get_max_constant_buffer_size() {
cl_ulong constant_buffer_size = 0;
if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &constant_buffer_size, NULL)) {
fprintf(stderr, "Failed to get max constant buffer size\n");
}
return constant_buffer_size;
}
// Queries CL_DEVICE_MAX_MEM_ALLOC_SIZE (largest single buffer allocation)
// for the selected device. Returns the size in bytes, or 0 on failure.
cl_ulong get_max_buffer_size() {
    cl_ulong buffer_size = 0;
    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &buffer_size, NULL)) {
        // FIX: message was copy-pasted from get_max_constant_buffer_size()
        fprintf(stderr, "Failed to get max buffer size\n");
    }
    return buffer_size;
}
cl_ulong get_max_tex_buffer_size() {
cl_ulong buffer_size = 0;
if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(cl_ulong), &buffer_size, NULL)) {
fprintf(stderr, "Failed to get max texture buffer size\n");
}
return buffer_size;
}
cl_ulong get_max_2d_tex_width() {
cl_ulong max_width = 0;
if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(cl_ulong), &max_width, NULL)) {
fprintf(stderr, "Failed to get max texture width\n");
}
return max_width;
}
cl_ulong get_max_2d_tex_height() {
cl_ulong max_width = 0;
if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(cl_ulong), &max_width, NULL)) {
fprintf(stderr, "Failed to get max texture height\n");
}
return max_width;
}
// Returns 1 if the selected device advertises extension_name in its
// CL_DEVICE_EXTENSIONS list, 0 otherwise (or on any query/allocation failure).
short checkExtensionSupport(const char *extension_name) {
    size_t extensionLen = 0;
    char* extensions;
    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_EXTENSIONS, 0, NULL, &extensionLen))
    {
        fprintf(stderr, "Could not determine memory needed to hold OpenCL extension list\n");
        return 0;
    }
    extensions = (char *)malloc(extensionLen + 1);
    if (extensions == NULL)
    {
        fprintf(stderr, "Could not allocate memory for OpenCL extension list\n");
        return 0;
    }
    extensions[extensionLen] = 0;
    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_EXTENSIONS, extensionLen, extensions, &extensionLen))
    {
        fprintf(stderr, "Could not get OpenCL extensions list\n");
        free(extensions); // FIX: was leaked on this error path
        return 0;
    }
    //fprintf(stderr, "OpenCL extensions list: %s\n", extensions);

    // extension list is space separated; count separators, then split the
    // string in place and record the start offset of each token
    size_t spaceCount = 0;
    for (size_t i = 0; i < extensionLen; i++) {
        if (extensions[i] == ' ') spaceCount++;
    }
    int* extensionsSpaces = (int*)malloc(sizeof(int) * (spaceCount + 1));
    if (extensionsSpaces == NULL)
    {
        fprintf(stderr, "Could not allocate memory for extension list offsets\n");
        free(extensions);
        return 0;
    }
    extensionsSpaces[0] = 0;
    int spaceIdx = 1;
    for (size_t i = 0; i < extensionLen; i++) {
        if (extensions[i] == ' ') {
            extensions[i] = 0;
            extensionsSpaces[spaceIdx] = (int)(i + 1);
            spaceIdx++;
        }
    }

    short found = 0;
    // FIX: also check the final token after the last space; the original loop
    // stopped one segment short. Empty segments (e.g. from a trailing space)
    // are skipped so an empty name still never matches.
    for (size_t i = 0; i <= spaceCount; i++)
    {
        if (extensions[extensionsSpaces[i]] == 0) continue;
        if (strcmp(extension_name, extensions + extensionsSpaces[i]) == 0) {
            found = 1;
            break;
        }
    }
    free(extensionsSpaces);
    free(extensions);
    return found;
}
///
/// populate global variables for opencl device id and platform id
///
/// platform index. if -1, prompt user
/// device index. if -1. prompt user
/// opencl context
cl_context get_context_from_user(int platform_index, int device_index) {
int i = 0;
int selected_platform_index = 0, selected_device_index = 0;
// Get platform and device information
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(0, NULL, &ret_num_platforms);
cl_platform_id* platforms = NULL;
cl_device_id* devices = NULL;
cl_context context = NULL;
platforms = (cl_platform_id*)malloc(ret_num_platforms * sizeof(cl_platform_id));
ret = clGetPlatformIDs(ret_num_platforms, platforms, NULL);
fprintf(stderr, "clGetPlatformIDs returned %d. %d platforms\n", ret, ret_num_platforms);
for (i = 0; i < ret_num_platforms; i++)
{
size_t platform_name_len;
char* platform_name = NULL;
if (CL_SUCCESS != clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &platform_name_len)) {
fprintf(stderr, "Failed to get platform info for platform %d\n", i);
continue;
}
platform_name = (char*)malloc(platform_name_len + 1);
platform_name[platform_name_len] = 0;
if (CL_SUCCESS != clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, platform_name_len, platform_name, NULL)) {
fprintf(stderr, "Failed to get platform name for platform %d\n", i);
free(platform_name);
continue;
}
fprintf(stderr, "Platform %d: %s\n", i, platform_name);
free(platform_name);
}
selected_platform_index = platform_index;
if (selected_platform_index == -1)
{
printf("Enter platform #:");
scanf("%d", &selected_platform_index);
}
if (selected_platform_index > ret_num_platforms - 1)
{
fprintf(stderr, "platform index out of range\n");
goto get_context_from_user_end;
}
selected_platform_id = platforms[selected_platform_index];
if (CL_SUCCESS != clGetDeviceIDs(selected_platform_id, CL_DEVICE_TYPE_ALL, 0, NULL, &ret_num_devices)) {
fprintf(stderr, "Failed to enumerate device ids for platform");
return NULL;
}
devices = (cl_device_id*)malloc(ret_num_devices * sizeof(cl_device_id));
if (CL_SUCCESS != clGetDeviceIDs(selected_platform_id, CL_DEVICE_TYPE_ALL, ret_num_devices, devices, NULL)) {
fprintf(stderr, "Failed to get device ids for platform");
free(devices);
return NULL;
}
fprintf(stderr, "clGetDeviceIDs returned %d devices\n", ret_num_devices);
for (i = 0; i < ret_num_devices; i++)
{
size_t device_name_len;
char* device_name = NULL;
if (CL_SUCCESS != clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 0, NULL, &device_name_len)) {
fprintf(stderr, "Failed to get name length for device %d\n", i);
continue;
}
//fprintf(stderr, "debug: device name length: %d\n", device_name_len);
device_name = (char*)malloc(device_name_len + 1);
device_name[device_name_len] = 0;
if (CL_SUCCESS != clGetDeviceInfo(devices[i], CL_DEVICE_NAME, device_name_len, device_name, &device_name_len)) {
fprintf(stderr, "Failed to get name for device %d\n", i);
free(device_name);
continue;
}
fprintf(stderr, "Device %d: %s\n", i, device_name);
free(device_name);
}
selected_device_index = device_index;
if (selected_device_index == -1)
{
fprintf(stderr, "Enter device #:");
scanf("%d", &selected_device_index);
}
if (selected_device_index > ret_num_devices - 1)
{
fprintf(stderr, "Device index out of range\n");
goto get_context_from_user_end;
}
selected_device_id = devices[selected_device_index];
// Create an OpenCL context
context = clCreateContext(NULL, 1, &selected_device_id, NULL, NULL, &ret);
fprintf(stderr, "clCreateContext returned %d\n", ret);
fprintf(stderr, "Max workgroup size for device: %u\n", getMaxWorkgroupSize());
get_context_from_user_end:
free(platforms);
free(devices);
return context;
}
// Read an OpenCL kernel source file and build it for the selected device.
// fname: path to the .cl source file (up to MAX_SOURCE_SIZE bytes are read)
// params: build options passed to clBuildProgram, may be NULL
// Returns the (possibly failed) program object; exits the process if the
// source file cannot be read. Prints the build log on compile failure.
cl_program build_program(cl_context context, const char* fname, const char *params)
{
    cl_int ret;
    FILE* fp = NULL;
    char* source_str;
    size_t source_size;
    // binary mode so fread's byte count matches the file length on Windows too
    fp = fopen(fname, "rb");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel %s.\n", fname);
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    if (!source_str) {
        fprintf(stderr, "Failed to allocate memory for kernel source %s\n", fname);
        fclose(fp);
        exit(1);
    }
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);
    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t*)&source_size, &ret);
    ret = clBuildProgram(program, 1, &selected_device_id, params, NULL, NULL);
    //fprintf(stderr, "clBuildProgram %s returned %d\n", fname, ret);
    if (ret == CL_BUILD_PROGRAM_FAILURE) // was the magic number -11
    {
        size_t log_size;
        fprintf(stderr, "OpenCL kernel build error\n");
        clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        char* log = (char*)malloc(log_size);
        clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
        fprintf(stderr, "%s\n", log);
        free(log);
    }
    free(source_str);
    return program;
}
// Dump the compiled binaries for every device a program was built for,
// writing each to a file named prog<device index>_<name>.
void write_program(cl_program program, const char *name)
{
    size_t* binarySizes = NULL;
    cl_uint nDevices = 0; // CL_PROGRAM_NUM_DEVICES returns cl_uint, not size_t
    cl_int ret;
    char fname[255];
    cl_uint i;
    unsigned char** binaries = NULL;
    ret = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &nDevices, NULL);
    if (ret != CL_SUCCESS) {
        fprintf(stderr, "Could not get number of devices for program\n");
        return;
    }
    fprintf(stderr, "Program is associated with %u devices\n", nDevices);
    binarySizes = (size_t*)malloc(sizeof(size_t) * nDevices);
    if (binarySizes == NULL)
    {
        fprintf(stderr, "Failed to allocate memory for binary sizes\n");
        goto getProgram_Fail;
    }
    ret = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * nDevices, binarySizes, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Could not get program binary sizes\n");
        goto getProgram_Fail;
    }
    // was malloc(nDevices): that allocated nDevices *bytes* rather than
    // nDevices pointers, overflowing the heap block below. calloc also
    // zeroes the pointers so the cleanup loop is safe on partial failure.
    binaries = (unsigned char**)calloc(nDevices, sizeof(unsigned char*));
    if (binaries == NULL)
    {
        fprintf(stderr, "Failed to allocate memory for program binaries\n");
        goto getProgram_Fail;
    }
    for (i = 0; i < nDevices; i++) {
        fprintf(stderr, "Device %u: %llu byte program\n", i, (unsigned long long)binarySizes[i]);
        binaries[i] = (unsigned char*)malloc(binarySizes[i]);
        if (binaries[i] == NULL)
        {
            fprintf(stderr, "Failed to allocate memory for binary %u\n", i);
            goto getProgram_Fail;
        }
    }
    ret = clGetProgramInfo(program, CL_PROGRAM_BINARIES, nDevices * sizeof(unsigned char*), binaries, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Could not get program binaries\n");
        goto getProgram_Fail;
    }
    for (i = 0; i < nDevices; i++)
    {
        snprintf(fname, sizeof(fname), "prog%u_%s", i, name);
        FILE* dst = fopen(fname, "wb"); // binary mode: compiled kernels are not text
        if (dst == NULL)
        {
            fprintf(stderr, "Could not open %s for writing\n", fname);
            continue;
        }
        fwrite(binaries[i], 1, binarySizes[i], dst);
        fclose(dst);
        fprintf(stderr, "Wrote compiled kernel to %s\n", fname);
    }
getProgram_Fail:
    if (binaries != NULL)
    {
        // guard: the early failure paths jump here before binaries is allocated;
        // the old code dereferenced the NULL pointer in this loop
        for (i = 0; i < nDevices; i++) free(binaries[i]);
        free(binaries);
    }
    free(binarySizes);
}
// Given last run settings, return target iteration count that should make the next run
// go for approximately TARGET_TIME_MS.
// iterations: iteration count used by the last run
// time_ms: how long that run took; 0 means "too fast to measure", so scale up 100x
uint32_t adjust_iterations(uint32_t iterations, uint64_t time_ms)
{
    // check for zero first - the old code performed the division before the
    // check, computing a garbage (infinite) intermediate value
    if (time_ms == 0) return iterations * 100;
    return (uint32_t)((float)iterations * TARGET_TIME_MS / (float)time_ms);
}
================================================
FILE: GpuMemLatency/instruction_rate.c
================================================
#include "opencltest.h"
float fp64_instruction_rate_test(cl_context context,
cl_command_queue command_queue,
uint32_t thread_count,
uint32_t local_size,
uint32_t chase_iterations,
int float4_element_count,
cl_mem a_mem_obj,
cl_mem result_obj,
cl_float* A,
cl_float* result);
float fp16_instruction_rate_test(cl_context context,
cl_command_queue command_queue,
uint32_t thread_count,
uint32_t local_size,
uint32_t chase_iterations,
int float4_element_count,
cl_mem a_mem_obj,
cl_mem result_obj,
cl_float* A,
cl_float* result);
float run_rate_test(cl_context context,
cl_command_queue command_queue,
cl_kernel kernel,
uint32_t thread_count,
uint32_t local_size,
uint32_t chase_iterations,
int float4_element_count,
cl_mem a_mem_obj,
cl_mem result_obj,
cl_float* A,
cl_float* result,
float totalOps);
float run_latency_test(cl_context context,
cl_command_queue command_queue,
cl_kernel kernel,
uint32_t chase_iterations,
int float4_element_count,
cl_mem a_mem_obj,
cl_mem result_obj,
cl_float* A,
cl_float* result,
float opsPerIteration);
float global_totalOps;
// Uber test: measures throughput and dependent-op latency for scalar and
// vector ALU operations (INT8/16/32/64, FP32, and optionally FP16/FP64).
// Individual results are printed to stderr/stdout as they complete.
// forcefp16 / forcefp64: run those tests even if the extension isn't reported.
// Returns 0 (results are reported via prints, not the return value).
float instruction_rate_test(cl_context context,
    cl_command_queue command_queue,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int forcefp16,
    int forcefp64)
{
    float gOpsPerSec = 0, opsPerIteration;
    cl_int ret;
    int float4_element_count = thread_count * 4;
    // Allocate host memory up front; the old code printed on failure but kept
    // going, which crashed on the first memset/loop below.
    float* A = (float*)malloc(sizeof(float) * float4_element_count * 4);
    float* result = (float*)malloc(sizeof(float) * 4 * thread_count);
    if (!A || !result)
    {
        fprintf(stderr, "Failed to allocate memory instruction rate test\n");
        free(A);
        free(result);
        return 0;
    }
    cl_program program = build_program(context, "instruction_rate_kernel.cl", NULL);
    if (saveprogram) write_program(program, "irate");
    cl_kernel int32_add_rate_kernel = clCreateKernel(program, "int32_add_rate_test", &ret);
    cl_kernel int32_mul_rate_kernel = clCreateKernel(program, "int32_mul_rate_test", &ret);
    cl_kernel fp32_add_rate_kernel = clCreateKernel(program, "fp32_add_rate_test", &ret);
    cl_kernel fp32_fma_rate_kernel = clCreateKernel(program, "fp32_fma_rate_test", &ret);
    cl_kernel fp32_builtin_fma_rate_kernel = clCreateKernel(program, "fp32_builtin_fma_rate_test", &ret);
    cl_kernel fp32_mad_rate_kernel = clCreateKernel(program, "fp32_mad_rate_test", &ret);
    cl_kernel fp32_rcp_rate_kernel = clCreateKernel(program, "fp32_rcp_rate_test", &ret);
    cl_kernel fp32_rsqrt_rate_kernel = clCreateKernel(program, "fp32_rsqrt_rate_test", &ret);
    cl_kernel mix_fp32_int32_add_rate_kernel = clCreateKernel(program, "mix_fp32_int32_add_rate_test", &ret);
    cl_kernel mix_fp32_int32_addmul_rate_kernel = clCreateKernel(program, "mix_fp32_int32_addmul_rate_test", &ret);
    cl_kernel int64_add_rate_kernel = clCreateKernel(program, "int64_add_rate_test", &ret);
    cl_kernel int64_mul_rate_kernel = clCreateKernel(program, "int64_mul_rate_test", &ret);
    cl_kernel int16_add_rate_kernel = clCreateKernel(program, "int16_add_rate_test", &ret);
    cl_kernel int16_mul_rate_kernel = clCreateKernel(program, "int16_mul_rate_test", &ret);
    cl_kernel int8_add_rate_kernel = clCreateKernel(program, "int8_add_rate_test", &ret);
    cl_kernel int8_mul_rate_kernel = clCreateKernel(program, "int8_mul_rate_test", &ret);
    cl_kernel fp32_fma_latency_kernel = clCreateKernel(program, "fp32_fma_latency_test", &ret);
    cl_kernel fp32_add_latency_kernel = clCreateKernel(program, "fp32_add_latency_test", &ret);
    cl_kernel int32_add_latency_kernel = clCreateKernel(program, "int32_add_latency_test", &ret);
    cl_kernel int32_mul_latency_kernel = clCreateKernel(program, "int32_mul_latency_test", &ret);
    cl_kernel int32_add_scalar_latency_kernel = clCreateKernel(program, "int32_add_scalar_latency_test", &ret);
    cl_kernel int32_mul_scalar_latency_kernel = clCreateKernel(program, "int32_mul_scalar_latency_test", &ret);
    cl_kernel fp32_add_scalar_latency_kernel = clCreateKernel(program, "fp32_add_scalar_latency_test", &ret);
    cl_kernel fp32_fma_scalar_latency_kernel = clCreateKernel(program, "fp32_fma_scalar_latency_test", &ret);
    cl_kernel fp32_mul_scalar_latency_kernel = clCreateKernel(program, "fp32_mul_scalar_latency_test", &ret);
    cl_kernel fp32_mul_latency_kernel = clCreateKernel(program, "fp32_mul_latency_test", &ret);
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, float4_element_count * sizeof(float), NULL, &ret);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * 4 * thread_count, NULL, &ret);
    // Integer test first
    uint32_t *int32_A = (uint32_t*)A;
    for (int i = 0; i < float4_element_count * 4; i++)
    {
        int32_A[i] = i + 1;
    }
    // 4x int4 * 8 per iteration, and count the loop increment too
    opsPerIteration = 4.0f * 8.0f;
    float int32_add_rate = run_rate_test(context, command_queue, int32_add_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT32 G Adds/sec: %f\n", int32_add_rate);
    printf("===== INT32 add latency =====\n");
    float int32_add_latency = run_latency_test(context, command_queue, int32_add_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "INT32 add latency: %f ns\n", int32_add_latency);
    printf("===== INT32 add latency (scalar) =====\n");
    int32_add_latency = run_latency_test(context, command_queue, int32_add_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "INT32 add latency (scalar): %f ns\n", int32_add_latency);
    printf("===== INT32 mul latency =====\n");
    float int32_mul_latency = run_latency_test(context, command_queue, int32_mul_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "INT32 mul latency: %f ns\n", int32_mul_latency);
    printf("===== INT32 mul latency (scalar) =====\n");
    int32_mul_latency = run_latency_test(context, command_queue, int32_mul_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "INT32 mul latency (scalar): %f ns\n", int32_mul_latency);
    opsPerIteration = 4.0f * 8.0f;
    // multiplies are slower on most hardware, so use fewer iterations
    float int32_mul_rate = run_rate_test(context, command_queue, int32_mul_rate_kernel, thread_count, local_size, (chase_iterations / 2),
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT32 G Multiplies/sec: %f\n", int32_mul_rate);
    // FP32 add and fma test
    cl_float* fp32_A = (cl_float*)A;
    for (int i = 0; i < float4_element_count * 4; i++)
    {
        fp32_A[i] = 0.5f * i;
    }
    opsPerIteration = 4.0f * 8.0f;
    float fp32_add_rate = run_rate_test(context, command_queue, fp32_add_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP32 G Adds/sec: %f\n", fp32_add_rate);
    printf("===== FP32 add latency =====\n");
    float fp32_add_latency = run_latency_test(context, command_queue, fp32_add_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "FP32 add latency: %f ns\n", fp32_add_latency);
    printf("===== FP32 add latency (scalar) =====\n");
    fp32_add_latency = run_latency_test(context, command_queue, fp32_add_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "FP32 add latency (scalar): %f ns\n", fp32_add_latency);
    printf("===== FP32 fma latency =====\n");
    float fp32_fma_latency = run_latency_test(context, command_queue, fp32_fma_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "FP32 FMA latency: %f ns\n", fp32_fma_latency);
    printf("===== FP32 fma latency (scalar) =====\n");
    fp32_fma_latency = run_latency_test(context, command_queue, fp32_fma_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "FP32 FMA latency (scalar): %f ns\n", fp32_fma_latency);
    printf("===== FP32 mul latency =====\n");
    fp32_fma_latency = run_latency_test(context, command_queue, fp32_mul_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "FP32 mul latency: %f ns\n", fp32_fma_latency);
    fp32_fma_latency = run_latency_test(context, command_queue, fp32_mul_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "FP32 mul latency (scalar): %f ns\n", fp32_fma_latency);
    float fp32_fma_rate = run_rate_test(context, command_queue, fp32_fma_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP32 G FMA/sec: %f : %f GFLOPs\n", fp32_fma_rate, fp32_fma_rate * 2);
    float builtin_fp32_fma_rate = run_rate_test(context, command_queue, fp32_builtin_fma_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP32 G fma()/sec: %f : %f GFLOPs\n", builtin_fp32_fma_rate, builtin_fp32_fma_rate * 2);
    fp32_fma_rate = run_rate_test(context, command_queue, fp32_mad_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP32 G mad()/sec: %f : %f GFLOPs\n", fp32_fma_rate, fp32_fma_rate * 2);
    float fp32_rcp_rate = run_rate_test(context, command_queue, fp32_rcp_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP32 G native_recip/sec: %f\n", fp32_rcp_rate);
    float fp32_rsqrt_rate = run_rate_test(context, command_queue, fp32_rsqrt_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP32 G native_rsqrt/sec: %f\n", fp32_rsqrt_rate);
    // Mixed INT32 and FP32 - 4 FP32, 4 INT32, and the loop increment
    // takes FP inputs and converts some to int
    opsPerIteration = 4.0f * 8.0f + 1.0f;
    float mix_fp32_int32_rate = run_rate_test(context, command_queue, mix_fp32_int32_add_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "Mixed INT32 and FP32 G Adds/sec: %f\n", mix_fp32_int32_rate);
    // Test the same with integer multiplies
    mix_fp32_int32_rate = run_rate_test(context, command_queue, mix_fp32_int32_addmul_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "Mixed INT32 Multiplies and FP32 G Adds/sec: %f\n", mix_fp32_int32_rate);
    // INT64 add test - reinterpret input as 64-bit (half as many elements)
    cl_ulong* int64_A = (cl_ulong*)A;
    for (int i = 0; i < float4_element_count * 2; i++)
    {
        int64_A[i] = i * 2;
    }
    opsPerIteration = 2.0f * 8.0f;
    float int64_add_rate = run_rate_test(context, command_queue, int64_add_rate_kernel, thread_count, local_size, chase_iterations / 2,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT64 G Adds/sec: %f\n", int64_add_rate);
    opsPerIteration = 2.0f * 8.0f;
    float int64_mul_rate = run_rate_test(context, command_queue, int64_mul_rate_kernel, thread_count, local_size, chase_iterations / 8,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT64 G Multiplies/sec: %f\n", int64_mul_rate);
    // INT16 (short) tests
    cl_ushort* int16_A = (cl_ushort*)A;
    for (int i = 0; i < float4_element_count * 8; i++)
    {
        int16_A[i] = i;
    }
    // short8
    opsPerIteration = 8.0f * 8.0f;
    float int16_add_rate = run_rate_test(context, command_queue, int16_add_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT16 G Adds/sec: %f\n", int16_add_rate);
    float int16_mul_rate = run_rate_test(context, command_queue, int16_mul_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT16 G Multiplies/sec: %f \n", int16_mul_rate);
    // INT8 (char) tests
    cl_char* int8_A = (cl_char*)A;
    for (int i = 0; i < float4_element_count * 8; i++)
    {
        int8_A[i] = i;
    }
    uint32_t int8_chase_iterations = chase_iterations / 10;
    opsPerIteration = 16.0f * 8.0f;
    float int8_add_rate = run_rate_test(context, command_queue, int8_add_rate_kernel, thread_count, local_size, int8_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT8 G Adds/sec: %f\n", int8_add_rate);
    float int8_mul_rate = run_rate_test(context, command_queue, int8_mul_rate_kernel, thread_count, local_size, int8_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT8 G Multiplies/sec: %f\n", int8_mul_rate);
    // checkExtensionSupport is defined in another translation unit
    short checkExtensionSupport(const char *extension_name);
    if (checkExtensionSupport("cl_khr_fp64") || forcefp64) {
        fp64_instruction_rate_test(context, command_queue, thread_count, local_size, chase_iterations, float4_element_count,
            a_mem_obj, result_obj, A, result);
    }
    else {
        fprintf(stderr, "FP64 not supported\n");
    }
    if (checkExtensionSupport("cl_khr_fp16") || forcefp16) {
        fp16_instruction_rate_test(context, command_queue, thread_count, local_size, chase_iterations, float4_element_count,
            a_mem_obj, result_obj, A, result);
    }
    else {
        fprintf(stderr, "FP16 not supported\n");
    }
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    // release kernels and the program - previously leaked on every call
    clReleaseKernel(int32_add_rate_kernel);
    clReleaseKernel(int32_mul_rate_kernel);
    clReleaseKernel(fp32_add_rate_kernel);
    clReleaseKernel(fp32_fma_rate_kernel);
    clReleaseKernel(fp32_builtin_fma_rate_kernel);
    clReleaseKernel(fp32_mad_rate_kernel);
    clReleaseKernel(fp32_rcp_rate_kernel);
    clReleaseKernel(fp32_rsqrt_rate_kernel);
    clReleaseKernel(mix_fp32_int32_add_rate_kernel);
    clReleaseKernel(mix_fp32_int32_addmul_rate_kernel);
    clReleaseKernel(int64_add_rate_kernel);
    clReleaseKernel(int64_mul_rate_kernel);
    clReleaseKernel(int16_add_rate_kernel);
    clReleaseKernel(int16_mul_rate_kernel);
    clReleaseKernel(int8_add_rate_kernel);
    clReleaseKernel(int8_mul_rate_kernel);
    clReleaseKernel(fp32_fma_latency_kernel);
    clReleaseKernel(fp32_add_latency_kernel);
    clReleaseKernel(int32_add_latency_kernel);
    clReleaseKernel(int32_mul_latency_kernel);
    clReleaseKernel(int32_add_scalar_latency_kernel);
    clReleaseKernel(int32_mul_scalar_latency_kernel);
    clReleaseKernel(fp32_add_scalar_latency_kernel);
    clReleaseKernel(fp32_fma_scalar_latency_kernel);
    clReleaseKernel(fp32_mul_scalar_latency_kernel);
    clReleaseKernel(fp32_mul_latency_kernel);
    clReleaseProgram(program);
    free(A);
    free(result);
    return gOpsPerSec;
}
// Executes one throughput kernel and converts the measured time into GOPS.
// The kernel is expected to perform opsPerIteration * chase_iterations
// operations per thread; buffers and host arrays are pre-allocated by the
// caller. Iteration count is scaled between runs until a run takes long
// enough (TARGET_TIME_MS / 2) to time reliably.
// Returns GOPS, or 0 if the kernel could not be enqueued or finished.
float run_rate_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int float4_element_count,
    cl_mem a_mem_obj,
    cl_mem result_obj,
    cl_float* A,
    cl_float* result,
    float opsPerIteration)
{
    size_t globalSize = thread_count;
    size_t localSize = local_size;
    cl_int status;
    float gops = 0.0f;
    uint64_t elapsedMs = 0;

    // stage input data and clear the result buffer
    memset(result, 0, sizeof(float) * 4 * thread_count);
    status = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, float4_element_count * sizeof(float), A, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * 4 * thread_count, result, 0, NULL, NULL);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue);

    // repeat with scaled-up iteration counts until the run is long enough
    do {
        start_timing();
        status = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
        if (status != CL_SUCCESS)
        {
            fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", status);
            return 0;
        }
        status = clFinish(command_queue);
        if (status != CL_SUCCESS)
        {
            printf("Failed to finish command queue. clFinish returned %d\n", status);
            return 0;
        }
        elapsedMs = end_timing();

        float totalWork = (float)chase_iterations * opsPerIteration * (float)thread_count;
        gops = ((float)totalWork / 1e9) / ((float)elapsedMs / 1000);
        chase_iterations = adjust_iterations(chase_iterations, elapsedMs);
        clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    } while (elapsedMs < TARGET_TIME_MS / 2);
    return gops;
}
// Variation of the test above but input array size is aligned with assumed wave size.
// With partitionPattern == NULL, alternating waves get inputs that take different
// branch directions (divergence test). Otherwise, waves are made active or idle
// according to partitionPattern (partition test), and the pattern is printed.
// wave: assumed hardware wave/warp size in threads
// Returns GOPS achieved by the active threads, or 0 on failure.
float run_divergence_rate_test(cl_context context,
    cl_command_queue command_queue,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t wave,
    int *partitionPattern)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    uint32_t active_threads = thread_count;
    cl_int ret;
    float totalOps, gOpsPerSec = 0.0f;
    uint64_t time_diff_ms = 0;
    uint32_t chase_iterations = 2500000;
    cl_mem a_mem_obj = NULL, result_obj = NULL;
    cl_program program = build_program(context, "instruction_rate_kernel.cl", NULL);
    cl_kernel kernel = clCreateKernel(program, partitionPattern == NULL ? "fp32_divergence_rate_test" : "fp32_partition_rate_test", &ret);
    float* result = (float*)malloc(sizeof(float) * thread_count);
    float* A = (float*)malloc(sizeof(float) * thread_count);
    if (!A || !result)
    {
        fprintf(stderr, "Failed to allocate memory for divergence rate test\n");
        goto divergence_cleanup; // gOpsPerSec is already 0
    }
    memset(result, 0, sizeof(float) * thread_count);
    if (partitionPattern != NULL) active_threads = 0;
    if (partitionPattern != NULL) fprintf(stderr, "\n");
    for (int i = 0; i < thread_count; i++)
    {
        if (partitionPattern == NULL) {
            // divergence test: alternate wave-sized groups between the two branch inputs
            if ((i / wave) % 2 == 0) A[i] = 0.2f;
            else A[i] = 0.8f;
        }
        else
        {
            if (partitionPattern[(i / wave)]) {
                A[i] = 0.2f;
                fprintf(stderr, "a ");
                active_threads++;
            }
            else
            {
                fprintf(stderr, "_ ");
                A[i] = 1.2f;
            }
            if ((i + 1) % wave == 0)
            {
                fprintf(stderr, "\n");
            }
        }
    }
    if (partitionPattern != NULL) fprintf(stderr, "\nActive threads: %d\n", active_threads);
    a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, thread_count * sizeof(float), NULL, &ret);
    result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, thread_count * sizeof(float), NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, thread_count * sizeof(float), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, thread_count * sizeof(float), result, 0, NULL, NULL);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue);
    // start with a low iteration count and try to make it work for all GPUs without needing manual iteration adjustment
    while (time_diff_ms < TARGET_TIME_MS / 2) {
        start_timing();
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
            fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
            gOpsPerSec = 0;
            goto divergence_cleanup; // was return 0, leaking buffers, kernel, and program
        }
        ret = clFinish(command_queue);
        if (ret != CL_SUCCESS)
        {
            printf("Failed to finish command queue. clFinish returned %d\n", ret);
            gOpsPerSec = 0;
            goto divergence_cleanup; // was return 0, leaking buffers, kernel, and program
        }
        time_diff_ms = end_timing();
        // the kernel does 8 dependent ops per iteration; only active threads do work
        totalOps = (float)chase_iterations * 8 * (float)active_threads;
        gOpsPerSec = ((float)totalOps / 1e9) / ((float)time_diff_ms / 1000);
        chase_iterations = adjust_iterations(chase_iterations, time_diff_ms);
        clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    }
divergence_cleanup:
    if (a_mem_obj != NULL) clReleaseMemObject(a_mem_obj);
    if (result_obj != NULL) clReleaseMemObject(result_obj);
    free(A);
    free(result);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    return gOpsPerSec;
}
// often takes time for clocks to settle?
#define LATENCY_REPEAT 5
// Measures dependent-operation latency by running a kernel with a single
// work-item (global and local size 1), so every op must wait for the previous
// one's result. opsPerIteration is the number of dependent ops the kernel's
// loop body performs. First calibrates the iteration count so a run takes
// roughly TARGET_TIME_MS, then times LATENCY_REPEAT runs and keeps the best.
// Returns the minimum observed latency in nanoseconds per op, or 0 on failure.
float run_latency_test(cl_context context,
cl_command_queue command_queue,
cl_kernel kernel,
uint32_t chase_iterations,
int float4_element_count,
cl_mem a_mem_obj,
cl_mem result_obj,
cl_float* A,
cl_float* result,
float opsPerIteration)
{
// single work-item: no parallelism, so elapsed time reflects the dependency chain
size_t global_item_size = 1;
size_t local_item_size = 1;
cl_int ret;
float latency;
uint64_t time_diff_ms = 0;
// hack around latency taking longer
// (one thread takes far longer per op than a full throughput dispatch)
chase_iterations = chase_iterations / 50;
// testing returning a float4
memset(result, 0, sizeof(float) * 4);
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, float4_element_count * sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * 4, result, 0, NULL, NULL);
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
clFinish(command_queue);
//fprintf(stderr, "Submitting fp32 add kernel to command queue\n");
// start with a low iteration count and try to make it work for all GPUs without needing manual iteration adjustment
while (time_diff_ms < TARGET_TIME_MS / 2) {
start_timing();
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
latency = 0;
return 0;
}
ret = clFinish(command_queue);
if (ret != CL_SUCCESS)
{
printf("Failed to finish command queue. clFinish returned %d\n", ret);
latency = 0;
return 0;
}
time_diff_ms = end_timing();
chase_iterations = adjust_iterations(chase_iterations, time_diff_ms);
clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
}
// calibration done: compute latency for the last run, in ns per dependent op
float totalOps = (float)chase_iterations * opsPerIteration * (float)global_item_size;
latency = (float)time_diff_ms * 1e6 / totalOps;
// fprintf(stderr, "\tinitial run: %f ns latency\n", latency);
// re-run several times and keep the fastest, since clocks may still be ramping
float minLatency = 0.0f;
for (int i = 0; i < LATENCY_REPEAT; i++)
{
start_timing();
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
clFinish(command_queue);
time_diff_ms = end_timing();
latency = (float)time_diff_ms * 1e6 / totalOps;
// fprintf(stderr, "\trun %d: %f ns latency\n", i, latency);
// i == 0 seeds minLatency, so the 0.0f initializer is never the result
if (i == 0 || latency < minLatency) minLatency = latency;
}
//fprintf(stderr, "chase iterations: %d, thread count: %d\n", chase_iterations, thread_count);
//fprintf(stderr, "total ops: %f (%.2f G)\ntotal time: %llu ms\n", totalOps, totalOps / 1e9, time_diff_ms);
return minLatency;
}
// taking out FP64 because some implementations don't support it. putting another build program + create kernel section
// in the main instruction rate test function would be too messy
// Measures FP64 add / FMA / mad() throughput using a separate kernel file.
// Reuses the caller's buffers and host arrays; the input is reinterpreted as
// doubles. Returns the last measured rate (G mad()/sec).
float fp64_instruction_rate_test(cl_context context,
    cl_command_queue command_queue,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int float4_element_count,
    cl_mem a_mem_obj,
    cl_mem result_obj,
    cl_float *A,
    cl_float *result)
{
    float gOpsPerSec, opsPerIteration;
    cl_int ret;
    // FP64 add test
    // FP64 is slow on most consumer GPUs, so start with fewer iterations
    uint32_t low_chase_iterations = chase_iterations / 4;
    // reinterpret the input array as doubles (half as many elements fit)
    cl_double* fp64_A = (cl_double*)A;
    for (int i = 0; i < float4_element_count * 2; i++)
    {
        fp64_A[i] = 0.5f * i;
    }
    memset(result, 0, sizeof(float) * 4 * thread_count);
    cl_program program = build_program(context, "instruction_rate_fp64_kernel.cl", NULL);
    if (saveprogram) write_program(program, "fp64irate");
    cl_kernel fp64_add_rate_kernel = clCreateKernel(program, "fp64_add_rate_test", &ret);
    cl_kernel fp64_fma_rate_kernel = clCreateKernel(program, "fp64_fma_rate_test", &ret);
    cl_kernel fp64_mad_rate_kernel = clCreateKernel(program, "fp64_mad_rate_test", &ret);
    // double2 x 8 per loop iteration
    opsPerIteration = 2.0f * 8.0f;
    gOpsPerSec = run_rate_test(context, command_queue, fp64_add_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP64 G Adds/sec: %f\n", gOpsPerSec);
    gOpsPerSec = run_rate_test(context, command_queue, fp64_fma_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP64 G FMAs/sec: %f : %f FP64 GFLOPs\n", gOpsPerSec, gOpsPerSec * 2);
    gOpsPerSec = run_rate_test(context, command_queue, fp64_mad_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP64 G mad()/sec: %f : %f FP64 GFLOPs\n", gOpsPerSec, gOpsPerSec * 2);
    // release kernels and program - previously leaked
    clReleaseKernel(fp64_add_rate_kernel);
    clReleaseKernel(fp64_fma_rate_kernel);
    clReleaseKernel(fp64_mad_rate_kernel);
    clReleaseProgram(program);
    return gOpsPerSec;
}
// taking out FP16 too because it requires an extension to be supported
float fp16_instruction_rate_test(cl_context context,
    cl_command_queue command_queue,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int float4_element_count,
    cl_float* A,
    cl_float* result)
{
    // Runs the FP16 add/FMA throughput kernels and prints GOPS figures.
    // Kept separate from the main instruction rate test because FP16 needs
    // the cl_khr_fp16 extension. Returns the last measured rate (FMA GOPS).
    float gOpsPerSec, totalOps;
    cl_int ret;

    // Match the reduced iteration count used by the other extension tests
    uint32_t low_chase_iterations = chase_iterations / 4;

    // Reinterpret the float buffer as halves. BUGFIX: this previously cast to
    // (cl_float*), which compiles with a warning but is the wrong type; a half
    // is half the size of a float, so float4_element_count float4s hold
    // float4_element_count * 8 halves.
    cl_half* fp16_A = (cl_half*)A;
    for (int i = 0; i < float4_element_count * 8; i++)
    {
        fp16_A[i] = (cl_half)(0.5f * i);
    }

    memset(result, 0, sizeof(float) * 4 * thread_count);
    cl_program program = build_program(context, "instruction_rate_fp16_kernel.cl", NULL);
    if (saveprogram) write_program(program, "fp16irate");
    cl_kernel fp16_add_rate_kernel = clCreateKernel(program, "fp16_add_rate_test", &ret);
    cl_kernel fp16_fma_rate_kernel = clCreateKernel(program, "fp16_fma_rate_test", &ret);
    //cl_kernel fp16_rsqrt_rate_kernel = clCreateKernel(program, "fp16_rsqrt_rate_test", &ret);

    // 8 independent half8 chains per loop iteration = 8 lanes * 8 ops
    totalOps = 8.0f * 8.0f;
    gOpsPerSec = run_rate_test(context, command_queue, fp16_add_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, totalOps);
    fprintf(stderr, "FP16 G Adds/sec: %f\n", gOpsPerSec);
    gOpsPerSec = run_rate_test(context, command_queue, fp16_fma_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, totalOps);
    fprintf(stderr, "FP16 G FMAs/sec: %f : %f FP16 GFLOPs\n", gOpsPerSec, gOpsPerSec * 2);
    /*gOpsPerSec = run_rate_test(context, command_queue, fp16_rsqrt_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, totalOps);
    fprintf(stderr, "FP16 G native_rsqrt/sec: %f\n", gOpsPerSec);*/

    // Release per-test OpenCL objects so repeated calls don't leak
    clReleaseKernel(fp16_add_rate_kernel);
    clReleaseKernel(fp16_fma_rate_kernel);
    clReleaseProgram(program);
    return gOpsPerSec;
}
================================================
FILE: GpuMemLatency/instruction_rate_fp16_kernel.cl
================================================
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define rate_local_mem_test_size 256
// FP16 add throughput: 8 independent half8 accumulator chains (8 adds x 8 lanes
// = 64 FP16 adds per loop iteration) provide enough ILP to saturate the FP16
// units. Requires cl_khr_fp16 (enabled at the top of this file).
__kernel void fp16_add_rate_test(__global half8 *A, int count, __global half8 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// reads go straight to global memory; the "local_a" name is historical
__global half8 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed the chains from memory so the compiler cannot constant-fold the loop
half8 v0 = local_a[masked_tid];
half8 v1 = local_a[masked_tid + 1];
half8 v2 = local_a[masked_tid + 2];
half8 v3 = local_a[masked_tid + 3];
half8 v4 = v0 + v1;
half8 v5 = v0 + v2;
half8 v6 = v0 + v3;
half8 v7 = v1 + v2;
half8 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP16 FMA throughput: same structure as fp16_add_rate_test, but each chain does
// v += acc * v, which the compiler can contract into a fused multiply-add
// (host side counts it as one op and doubles it for FLOPS).
__kernel void fp16_fma_rate_test(__global half8 *A, int count, __global half8 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global half8 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
half8 v0 = local_a[masked_tid];
half8 v1 = local_a[masked_tid + 1];
half8 v2 = local_a[masked_tid + 2];
half8 v3 = local_a[masked_tid + 3];
half8 v4 = v0 + v1;
half8 v5 = v0 + v2;
half8 v6 = v0 + v3;
half8 v7 = v1 + v2;
half8 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 += acc * v0;
v1 += acc * v1;
v2 += acc * v2;
v3 += acc * v3;
v4 += acc * v4;
v5 += acc * v5;
v6 += acc * v6;
v7 += acc * v7;
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
/*__kernel void fp16_rsqrt_rate_test(__global half8 *A, int count, __global half8 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global half8 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
half8 v0 = local_a[masked_tid];
half8 v1 = local_a[masked_tid + 1];
half8 v2 = local_a[masked_tid + 2];
half8 v3 = local_a[masked_tid + 3];
half8 v4 = v0 + v1;
half8 v5 = v0 + v2;
half8 v6 = v0 + v3;
half8 v7 = v1 + v2;
for (int i = 0; i < count; i++) {
v0 = native_rsqrt(v0);
v1 = native_rsqrt(v1);
v2 = native_rsqrt(v2);
v3 = native_rsqrt(v3);
v4 = native_rsqrt(v4);
v5 = native_rsqrt(v5);
v6 = native_rsqrt(v6);
v7 = native_rsqrt(v7);
}
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
*/
================================================
FILE: GpuMemLatency/instruction_rate_fp64_kernel.cl
================================================
#define rate_local_mem_test_size 256
// FP64 add throughput: 8 independent double2 chains (8 adds x 2 lanes = 16 FP64
// adds per loop iteration) provide ILP without needing many FP64 registers.
__kernel void fp64_add_rate_test(__global double2 *A, int count, __global double2 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// reads go straight to global memory; the "local_a" name is historical
__global double2 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed the chains from memory so the compiler cannot constant-fold the loop
double2 v0 = local_a[masked_tid];
double2 v1 = local_a[masked_tid + 1];
double2 v2 = local_a[masked_tid + 2];
double2 v3 = local_a[masked_tid + 3];
double2 v4 = v0 + v1;
double2 v5 = v0 + v2;
double2 v6 = v0 + v3;
double2 v7 = v1 + v2;
double2 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP64 FMA throughput: same structure as fp64_add_rate_test, but each chain does
// v += acc * v, which the compiler may contract into a fused multiply-add.
__kernel void fp64_fma_rate_test(__global double2 *A, int count, __global double2 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global double2 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
double2 v0 = local_a[masked_tid];
double2 v1 = local_a[masked_tid + 1];
double2 v2 = local_a[masked_tid + 2];
double2 v3 = local_a[masked_tid + 3];
double2 v4 = v0 + v1;
double2 v5 = v0 + v2;
double2 v6 = v0 + v3;
double2 v7 = v1 + v2;
double2 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 += acc * v0;
v1 += acc * v1;
v2 += acc * v2;
v3 += acc * v3;
v4 += acc * v4;
v5 += acc * v5;
v6 += acc * v6;
v7 += acc * v7;
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP64 mad() throughput: same structure as fp64_fma_rate_test but uses the
// OpenCL mad() builtin, which permits a faster, possibly less accurate
// multiply-add than fma()/contraction.
__kernel void fp64_mad_rate_test(__global double2 *A, int count, __global double2 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global double2 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
double2 v0 = local_a[masked_tid];
double2 v1 = local_a[masked_tid + 1];
double2 v2 = local_a[masked_tid + 2];
double2 v3 = local_a[masked_tid + 3];
double2 v4 = v0 + v1;
double2 v5 = v0 + v2;
double2 v6 = v0 + v3;
double2 v7 = v1 + v2;
double2 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 = mad(acc, v0, v0);
v1 = mad(acc, v1, v1);
v2 = mad(acc, v2, v2);
v3 = mad(acc, v3, v3);
// BUGFIX: was mad(acc, v4, v3), a copy/paste typo that made the v4 chain
// depend on v3, serializing them. All chains now follow the mad(acc, vN, vN)
// pattern like fp32_mad_rate_test.
v4 = mad(acc, v4, v4);
v5 = mad(acc, v5, v5);
v6 = mad(acc, v6, v6);
v7 = mad(acc, v7, v7);
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
================================================
FILE: GpuMemLatency/instruction_rate_kernel.cl
================================================
#define rate_local_mem_test_size 512
// A must be at least (local size * 4) uint32 elements in size, but must not exceed local mem size
// jk it doesn't use local mem now
// INT32 add throughput: 8 independent uint4 chains (32 adds per loop iteration).
// Unlike most kernels in this file, this one stages A into __local memory and
// reloads acc from it every iteration.
__kernel void int32_add_rate_test(__global uint4 *A, int count, __global uint4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// cooperatively copy the first rate_local_mem_test_size elements into local mem
__local uint4 local_a[rate_local_mem_test_size];
for (int i = tid;i < rate_local_mem_test_size; i += max_offset)
local_a[i] = A[i];
barrier(CLK_LOCAL_MEM_FENCE);
// __global uint4 *local_a = A;
// clamp rather than mask so masked_tid + 7 stays in bounds
int masked_tid = min(tid, rate_local_mem_test_size - 8);
uint4 v0 = local_a[masked_tid];
uint4 v1 = local_a[masked_tid + 1];
uint4 v2 = local_a[masked_tid + 2];
uint4 v3 = local_a[masked_tid + 3];
uint4 v4 = local_a[masked_tid + 4];
uint4 v5 = local_a[masked_tid + 5];
uint4 v6 = local_a[masked_tid + 6];
uint4 v7 = local_a[masked_tid + 7];
for (int i = 0; i < count; i++) {
// fresh acc each iteration so the adds can't be folded into one multiply
uint4 acc = local_a[i & (rate_local_mem_test_size - 1)];
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT32 multiply throughput: 8 independent uint4 chains (32 multiplies per loop
// iteration) with a loop-invariant multiplier read from global memory.
__kernel void int32_mul_rate_test(__global uint4 *A, int count, __global uint4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// reads go straight to global memory; the "local_a" name is historical
__global uint4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed the chains from memory so the compiler cannot constant-fold the loop
uint4 v0 = local_a[masked_tid];
uint4 v1 = local_a[masked_tid + 1];
uint4 v2 = local_a[masked_tid + 2];
uint4 v3 = local_a[masked_tid + 3];
uint4 v4 = v0 + v1;
uint4 v5 = v0 + v2;
uint4 v6 = v0 + v3;
uint4 v7 = v1 + v2;
uint4 acc = local_a[0];
for (int i = 0; i < count; i++) {
//uint4 acc = local_a[i & (rate_local_mem_test_size) - 1];
v0 *= acc;
v1 *= acc;
v2 *= acc;
v3 *= acc;
v4 *= acc;
v5 *= acc;
v6 *= acc;
v7 *= acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 add throughput: 8 independent float4 chains (32 FP32 adds per loop
// iteration) provide enough ILP to saturate the FP32 ALUs.
__kernel void fp32_add_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// reads go straight to global memory; the "local_a" name is historical
__global float4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed the chains from memory so the compiler cannot constant-fold the loop
float4 v0 = local_a[masked_tid];
float4 v1 = local_a[masked_tid + 1];
float4 v2 = local_a[masked_tid + 2];
float4 v3 = local_a[masked_tid + 3];
float4 v4 = v0 + v1;
float4 v5 = v0 + v2;
float4 v6 = v0 + v3;
float4 v7 = v1 + v2;
float4 acc = local_a[0];
for (int i = 0; i < count; i++) {
//float4 acc = local_a[i & (rate_local_mem_test_size) - 1];
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 reciprocal throughput: 8 independent float4 chains of native_recip
// (32 lanes per loop iteration). native_recip maps to the hardware's fast
// approximate reciprocal, so this gauges the special function unit rate.
__kernel void fp32_rcp_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
float4 v0 = local_a[masked_tid];
float4 v1 = local_a[masked_tid + 1];
float4 v2 = local_a[masked_tid + 2];
float4 v3 = local_a[masked_tid + 3];
float4 v4 = v0 + v1;
float4 v5 = v0 + v2;
float4 v6 = v0 + v3;
float4 v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
float4 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 = native_recip(v0);
v1 = native_recip(v1);
v2 = native_recip(v2);
v3 = native_recip(v3);
v4 = native_recip(v4);
v5 = native_recip(v5);
v6 = native_recip(v6);
v7 = native_recip(v7);
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 reciprocal square root throughput: 8 independent float4 chains of
// native_rsqrt (32 lanes per loop iteration); gauges the special function unit.
__kernel void fp32_rsqrt_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
float4 v0 = local_a[masked_tid];
float4 v1 = local_a[masked_tid + 1];
float4 v2 = local_a[masked_tid + 2];
float4 v3 = local_a[masked_tid + 3];
float4 v4 = v0 + v1;
float4 v5 = v0 + v2;
float4 v6 = v0 + v3;
float4 v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
float4 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 = native_rsqrt(v0);
v1 = native_rsqrt(v1);
v2 = native_rsqrt(v2);
v3 = native_rsqrt(v3);
v4 = native_rsqrt(v4);
v5 = native_rsqrt(v5);
v6 = native_rsqrt(v6);
v7 = native_rsqrt(v7);
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT64 add throughput: 8 independent ulong2 chains (8 adds x 2 lanes = 16
// 64-bit adds per loop iteration).
__kernel void int64_add_rate_test(__global ulong2 *A, int count, __global ulong2 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// reads go straight to global memory; the "local_a" name is historical
__global ulong2 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed the chains from memory so the compiler cannot constant-fold the loop
ulong2 v0 = local_a[masked_tid];
ulong2 v1 = local_a[masked_tid + 1];
ulong2 v2 = local_a[masked_tid + 2];
ulong2 v3 = local_a[masked_tid + 3];
ulong2 v4 = v0 + v1;
ulong2 v5 = v0 + v2;
ulong2 v6 = v0 + v3;
ulong2 v7 = v1 + v2;
ulong2 acc = local_a[0];
for (int i = 0; i < count; i++) {
//uint4 acc = local_a[i & (rate_local_mem_test_size) - 1];
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT64 multiply throughput: 8 independent ulong2 chains (16 64-bit multiplies
// per loop iteration) with a loop-invariant multiplier.
__kernel void int64_mul_rate_test(__global ulong2 *A, int count, __global ulong2 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global ulong2 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
ulong2 v0 = local_a[masked_tid];
ulong2 v1 = local_a[masked_tid + 1];
ulong2 v2 = local_a[masked_tid + 2];
ulong2 v3 = local_a[masked_tid + 3];
ulong2 v4 = v0 + v1;
ulong2 v5 = v0 + v2;
ulong2 v6 = v0 + v3;
ulong2 v7 = v1 + v2;
ulong2 acc = local_a[0];
for (int i = 0; i < count; i++) {
//uint4 acc = local_a[i & (rate_local_mem_test_size) - 1];
v0 *= acc;
v1 *= acc;
v2 *= acc;
v3 *= acc;
v4 *= acc;
v5 *= acc;
v6 *= acc;
v7 *= acc;
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// Mixed FP32/INT32 add throughput: 4 float4 chains plus 4 int4 chains per loop
// iteration, to see whether FP and integer adds can issue in parallel. The int
// accumulator is reloaded from __local memory each iteration.
__kernel void mix_fp32_int32_add_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// stage an int4 copy of A in local memory for the integer chains
__local int4 local_a[rate_local_mem_test_size];
for (int i = tid;i < rate_local_mem_test_size; i += max_offset)
local_a[i] = convert_int4_sat(A[i]);
barrier(CLK_LOCAL_MEM_FENCE);
int masked_tid = tid & (rate_local_mem_test_size - 1);
// FP chains seed directly from global memory
float4 v0 = A[masked_tid];
float4 v1 = A[masked_tid + 1];
float4 v2 = A[masked_tid + 2];
float4 v3 = A[masked_tid + 3];
// integer chains seed from saturating conversions of the FP values
int4 v4 = convert_int4_sat(v0 + v1);
int4 v5 = convert_int4_sat(v0 + v2);
int4 v6 = convert_int4_sat(v0 + v3);
int4 v7 = convert_int4_sat(v1 + v2);
float4 fp_acc = A[0];
for (int i = 0; i < count; i++) {
int4 int_acc = local_a[i & (rate_local_mem_test_size - 1)];
v0 += fp_acc;
v1 += fp_acc;
v2 += fp_acc;
v3 += fp_acc;
v4 += int_acc;
v5 += int_acc;
v6 += int_acc;
v7 += int_acc;
}
// fold both result sets into the float4 output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + convert_float4(v4 + v5 + v6 + v7);
}
// Mixed FP32 add / INT32 multiply throughput: 4 float4 add chains plus 4 int4
// multiply chains per loop iteration, to see whether the two op types can
// issue in parallel.
__kernel void mix_fp32_int32_addmul_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float4 *fp32_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// FP chains seed directly from global memory
float4 v0 = fp32_a[masked_tid];
float4 v1 = fp32_a[masked_tid + 1];
float4 v2 = fp32_a[masked_tid + 2];
float4 v3 = fp32_a[masked_tid + 3];
// integer chains seed from saturating conversions of the FP values
int4 v4 = convert_int4_sat(v0 + v1);
int4 v5 = convert_int4_sat(v0 + v2);
int4 v6 = convert_int4_sat(v0 + v3);
int4 v7 = convert_int4_sat(v1 + v2);
float4 fp_acc = fp32_a[0];
int4 int_acc = convert_int4_sat(fp32_a[0]);
for (int i = 0; i < count; i++) {
v0 += fp_acc;
v1 += fp_acc;
v2 += fp_acc;
v3 += fp_acc;
v4 *= int_acc;
v5 *= int_acc;
v6 *= int_acc;
v7 *= int_acc;
}
// fold both result sets into the float4 output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + convert_float4(v4 + v5 + v6 + v7);
}
// FP32 FMA throughput: 8 independent float4 chains of v += acc * v (32 lanes
// per loop iteration), which the compiler can contract into fused multiply-adds.
__kernel void fp32_fma_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// all 8 chains seed directly from memory (unlike most kernels here, which
// derive v4..v7 arithmetically)
float4 v0 = local_a[masked_tid];
float4 v1 = local_a[masked_tid + 1];
float4 v2 = local_a[masked_tid + 2];
float4 v3 = local_a[masked_tid + 3];
float4 v4 = local_a[masked_tid + 4];
float4 v5 = local_a[masked_tid + 5];
float4 v6 = local_a[masked_tid + 6];
float4 v7 = local_a[masked_tid + 7];
float4 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 += acc * v0;
v1 += acc * v1;
v2 += acc * v2;
v3 += acc * v3;
v4 += acc * v4;
v5 += acc * v5;
v6 += acc * v6;
v7 += acc * v7;
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 fma() builtin throughput: like fp32_fma_rate_test but uses the
// correctly-rounded fma() builtin instead of relying on contraction of a*b+c.
__kernel void fp32_builtin_fma_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// all 8 chains seed directly from memory
float4 v0 = local_a[masked_tid];
float4 v1 = local_a[masked_tid + 1];
float4 v2 = local_a[masked_tid + 2];
float4 v3 = local_a[masked_tid + 3];
float4 v4 = local_a[masked_tid + 4];
float4 v5 = local_a[masked_tid + 5];
float4 v6 = local_a[masked_tid + 6];
float4 v7 = local_a[masked_tid + 7];
float4 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 = fma(acc, v0, v0);
v1 = fma(acc, v1, v1);
v2 = fma(acc, v2, v2);
v3 = fma(acc, v3, v3);
v4 = fma(acc, v4, v4);
v5 = fma(acc, v5, v5);
v6 = fma(acc, v6, v6);
v7 = fma(acc, v7, v7);
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 mad() throughput: like fp32_builtin_fma_rate_test but uses mad(), which
// permits a faster, possibly less accurate multiply-add than fma().
__kernel void fp32_mad_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
float4 v0 = local_a[masked_tid];
float4 v1 = local_a[masked_tid + 1];
float4 v2 = local_a[masked_tid + 2];
float4 v3 = local_a[masked_tid + 3];
float4 v4 = v0 + v1;
float4 v5 = v0 + v2;
float4 v6 = v0 + v3;
float4 v7 = v1 + v2;
float4 acc = local_a[0];
for (int i = 0; i < count; i++) {
//float4 acc = local_a[i & (rate_local_mem_test_size) - 1];
v0 = mad(acc, v0, v0);
v1 = mad(acc, v1, v1);
v2 = mad(acc, v2, v2);
v3 = mad(acc, v3, v3);
v4 = mad(acc, v4, v4);
v5 = mad(acc, v5, v5);
v6 = mad(acc, v6, v6);
v7 = mad(acc, v7, v7);
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT16 add throughput: 8 independent short8 chains (8 adds x 8 lanes = 64
// 16-bit adds per loop iteration). A is staged into __local memory and acc is
// reloaded from it every iteration.
__kernel void int16_add_rate_test(__global short8 *A, int count, __global short8 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
//__global short8 *local_a = A;
// cooperatively copy A into local memory
__local short8 local_a[rate_local_mem_test_size];
for (int i = tid;i < rate_local_mem_test_size; i += max_offset)
local_a[i] = A[i];
barrier(CLK_LOCAL_MEM_FENCE);
// clamp rather than mask so masked_tid + 7 stays in bounds
int masked_tid = min(tid, rate_local_mem_test_size - 8);
short8 v0 = local_a[masked_tid];
short8 v1 = local_a[masked_tid + 1];
short8 v2 = local_a[masked_tid + 2];
short8 v3 = local_a[masked_tid + 3];
short8 v4 = local_a[masked_tid + 4];
short8 v5 = local_a[masked_tid + 5];
short8 v6 = local_a[masked_tid + 6];
short8 v7 = local_a[masked_tid + 7];
for (int i = 0; i < count; i++) {
// fresh acc each iteration so the adds can't be folded into one multiply
short8 acc = local_a[i & (rate_local_mem_test_size - 1)];
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT16 multiply throughput: same staging/structure as int16_add_rate_test, but
// each chain multiplies by a per-iteration value from local memory.
__kernel void int16_mul_rate_test(__global short8 *A, int count, __global short8 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
//__global short8 *local_a = A;
// cooperatively copy A into local memory
__local short8 local_a[rate_local_mem_test_size];
for (int i = tid;i < rate_local_mem_test_size; i += max_offset)
local_a[i] = A[i];
barrier(CLK_LOCAL_MEM_FENCE);
// clamp rather than mask so masked_tid + 7 stays in bounds
int masked_tid = min(tid, rate_local_mem_test_size - 8);
short8 v0 = local_a[masked_tid];
short8 v1 = local_a[masked_tid + 1];
short8 v2 = local_a[masked_tid + 2];
short8 v3 = local_a[masked_tid + 3];
short8 v4 = local_a[masked_tid + 4];
short8 v5 = local_a[masked_tid + 5];
short8 v6 = local_a[masked_tid + 6];
short8 v7 = local_a[masked_tid + 7];
for (int i = 0; i < count; i++) {
short8 acc = local_a[i & (rate_local_mem_test_size - 1)];
v0 *= acc;
v1 *= acc;
v2 *= acc;
v3 *= acc;
v4 *= acc;
v5 *= acc;
v6 *= acc;
v7 *= acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT8 add throughput: 8 independent char16 chains (8 adds x 16 lanes = 128
// 8-bit adds per loop iteration).
__kernel void int8_add_rate_test(__global char16 *A, int count, __global char16 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// reads go straight to global memory; the "local_a" name is historical
__global char16 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed the chains from memory so the compiler cannot constant-fold the loop
char16 v0 = local_a[masked_tid];
char16 v1 = local_a[masked_tid + 1];
char16 v2 = local_a[masked_tid + 2];
char16 v3 = local_a[masked_tid + 3];
char16 v4 = v0 + v1;
char16 v5 = v0 + v2;
char16 v6 = v0 + v3;
char16 v7 = v1 + v2;
char16 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT8 multiply throughput: 8 independent char16 chains (128 8-bit multiplies
// per loop iteration) with a loop-invariant multiplier.
__kernel void int8_mul_rate_test(__global char16 *A, int count, __global char16 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global char16 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
char16 v0 = local_a[masked_tid];
char16 v1 = local_a[masked_tid + 1];
char16 v2 = local_a[masked_tid + 2];
char16 v3 = local_a[masked_tid + 3];
char16 v4 = v0 + v1;
char16 v5 = v0 + v2;
char16 v6 = v0 + v3;
char16 v7 = v1 + v2;
char16 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 *= acc;
v1 *= acc;
v2 *= acc;
v3 *= acc;
v4 *= acc;
v5 *= acc;
v6 *= acc;
v7 *= acc;
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 FMA latency: unlike the rate tests, every statement reads the result of
// the previous one, forming a single serial dependency chain, so runtime is
// dominated by FMA latency rather than throughput. The loop body is unrolled
// 4x (32 dependent FMAs) while i advances by 4 — presumably the host accounts
// for ops-per-iteration when converting time to latency; TODO confirm.
__kernel void fp32_fma_latency_test(__global float *A, int count, __global float *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed values from memory so the chain cannot be constant-folded
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
float acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 add latency: a single serial dependency chain of adds (each statement
// consumes the previous result). Unrolled 4x (32 dependent adds) per loop
// iteration while i advances by 4 — presumably the host normalizes; TODO confirm.
__kernel void fp32_add_latency_test(__global float *A, int count, __global float *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed values from memory so the chain cannot be constant-folded
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
float acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT32 add latency: a single serial dependency chain of 32-bit adds, unrolled
// 4x (32 dependent adds) per loop iteration while i advances by 4 — presumably
// the host normalizes; TODO confirm.
__kernel void int32_add_latency_test(__global uint *A, int count, __global uint *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed values from memory so the chain cannot be constant-folded
uint v0 = A[masked_tid];
uint v1 = A[masked_tid + 1];
uint v2 = A[masked_tid + 2];
uint v3 = A[masked_tid + 3];
uint v4 = v0 + v1;
uint v5 = v0 + v2;
uint v6 = v0 + v3;
uint v7 = v1 + v2;
for (int i = 0; i < count; i += 4) {
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT32 multiply latency: a single serial dependency chain of 32-bit multiplies,
// unrolled 4x (32 dependent multiplies) per loop iteration while i advances by
// 4 — presumably the host normalizes; TODO confirm.
__kernel void int32_mul_latency_test(__global uint *A, int count, __global uint *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global uint *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed values from memory so the chain cannot be constant-folded
uint v0 = local_a[masked_tid];
uint v1 = local_a[masked_tid + 1];
uint v2 = local_a[masked_tid + 2];
uint v3 = local_a[masked_tid + 3];
uint v4 = v0 + v1;
uint v5 = v0 + v2;
uint v6 = v0 + v3;
uint v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
uint acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// Divergence penalty test: each work-item picks add vs multiply based on its
// own input value (op < 0.5), so lanes within a wavefront/warp can diverge and
// the hardware may have to execute both sides predicated. Comparing against
// the non-divergent tests shows the cost. Work-items with op >= 1.0 skip the
// loop entirely.
__kernel void fp32_divergence_rate_test(__global float *A, int count, __global float *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
float acc = A[0];
// per-work-item selector read from the input, so the compiler can't prove
// which branch runs
float op = A[get_global_id(0)];
if (op < 1.0) {
for (int i = 0; i < count; i++) {
if (op < 0.5) {
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
else
{
v0 *= acc;
v1 *= acc;
v2 *= acc;
v3 *= acc;
v4 *= acc;
v5 *= acc;
v6 *= acc;
v7 *= acc;
}
}
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// Partition test: only work-items whose input value is < 1.0 execute the add
// loop at all; the rest skip straight to the store. Presumably used with host
// data that activates a subset of lanes, to see how throughput scales with the
// active fraction — confirm against the host-side caller.
__kernel void fp32_partition_rate_test(__global float *A, int count, __global float *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
float acc = A[0];
// per-work-item activation flag read from the input
float op = A[get_global_id(0)];
if (op < 1.0) {
for (int i = 0; i < count; i++) {
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
/// Scalar latency
// INT32 add latency, uniform variant: tid is forced to 0 so every work-item
// loads the same addresses and computes the same serial chain. NOTE(review):
// presumably this lets a compiler place the chain on a scalar/uniform datapath
// (e.g. AMD's scalar ALU) — confirm against generated ISA. Chain structure
// matches int32_add_latency_test (32 dependent adds per iteration, i += 4).
__kernel void int32_add_scalar_latency_test(__global uint* A, int count, __global uint* ret) {
int tid = 0;
int max_offset = get_local_size(0);
// with tid = 0, masked_tid is always 0
int masked_tid = tid & (rate_local_mem_test_size - 1);
uint v0 = A[masked_tid];
uint v1 = A[masked_tid + 1];
uint v2 = A[masked_tid + 2];
uint v3 = A[masked_tid + 3];
uint v4 = v0 + v1;
uint v5 = v0 + v2;
uint v6 = v0 + v3;
uint v7 = v1 + v2;
for (int i = 0; i < count; i += 4) {
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT32 multiply latency, uniform variant: tid forced to 0 so all work-items
// compute an identical serial chain (see int32_add_scalar_latency_test for the
// rationale). 32 dependent multiplies per iteration, i += 4.
__kernel void int32_mul_scalar_latency_test(__global uint* A, int count, __global uint* ret) {
int tid = 0;
int max_offset = get_local_size(0);
__global uint* local_a = A;
// with tid = 0, masked_tid is always 0
int masked_tid = tid & (rate_local_mem_test_size - 1);
uint v0 = local_a[masked_tid];
uint v1 = local_a[masked_tid + 1];
uint v2 = local_a[masked_tid + 2];
uint v3 = local_a[masked_tid + 3];
uint v4 = v0 + v1;
uint v5 = v0 + v2;
uint v6 = v0 + v3;
uint v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
uint acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 add latency, uniform variant: tid forced to 0 so all work-items compute
// an identical serial chain (see int32_add_scalar_latency_test). NOTE(review):
// this loop is unrolled 8x (64 dependent adds) while i advances by 8, unlike
// the other latency kernels which unroll 4x — confirm the host accounts for
// the same ops-per-count ratio.
__kernel void fp32_add_scalar_latency_test(__global float* A, int count, __global float* ret) {
int tid = 0;
int max_offset = get_local_size(0);
__global float* local_a = A;
// with tid = 0, masked_tid is always 0
int masked_tid = tid & (rate_local_mem_test_size - 1);
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
float acc = local_a[0];
for (int i = 0; i < count; i += 8) {
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 FMA latency, uniform variant: tid forced to 0 so all work-items compute
// an identical serial chain (see int32_add_scalar_latency_test). 32 dependent
// v = prev + acc * v steps per iteration, i += 4.
__kernel void fp32_fma_scalar_latency_test(__global float* A, int count, __global float* ret) {
int tid = 0;
int max_offset = get_local_size(0);
__global float* local_a = A;
// with tid = 0, masked_tid is always 0
int masked_tid = tid & (rate_local_mem_test_size - 1);
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
float acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 multiply latency, uniform variant: tid forced to 0 so all work-items
// compute an identical serial chain (see int32_add_scalar_latency_test).
// 32 dependent multiplies per iteration, i += 4.
__kernel void fp32_mul_scalar_latency_test(__global float* A, int count, __global float* ret) {
int tid = 0;
int max_offset = get_local_size(0);
__global float* local_a = A;
// with tid = 0, masked_tid is always 0
int masked_tid = tid & (rate_local_mem_test_size - 1);
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
float acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 multiply latency: a single serial dependency chain of multiplies,
// unrolled 4x (32 dependent multiplies) per loop iteration while i advances by
// 4 — presumably the host normalizes; TODO confirm. Per-work-item variant of
// fp32_mul_scalar_latency_test (tid comes from get_local_id here).
__kernel void fp32_mul_latency_test(__global float* A, int count, __global float* ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float* local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed values from memory so the chain cannot be constant-folded
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
float acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
================================================
FILE: GpuMemLatency/kernel.cl
================================================
// not used, I tried
__constant sampler_t direct_sampler = CLK_NORMALIZED_COORDS_FALSE | // coordinates are from 0 to max dimension size
CLK_ADDRESS_NONE | // if it goes out of bounds feel free to explode and die
CLK_FILTER_NEAREST;
// Pointer-chasing latency through the texture/image path. Each texel's .x
// component stores the index of the next texel to visit.
// list_size is unused here but kept so the host-side kernel interface matches.
__kernel void tex_latency_test(__read_only image1d_buffer_t A, int count, __global int* ret, int list_size) {
    // With more than one thread in flight, resume each thread's chase from
    // where it left off (stashed in ret); otherwise start at texel 0.
    int startPos = get_global_size(0) > 1 ? ret[get_global_id(0)] : 0;
    // Sampler-less read: using a sampler (see direct_sampler above) broke this test
    uint4 current = read_imageui(A, startPos);
    for (int i = 0; i < count; i += 10) {
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
    }
    // Persist the position so the chain can't be optimized out and a
    // follow-up launch can resume from it
    ret[get_global_id(0)] = current.x;
}
__constant sampler_t funny_sampler = CLK_NORMALIZED_COORDS_TRUE | // coordinates are from 0 to 1 (float)
CLK_ADDRESS_REPEAT | // going out of bounds = replicate
CLK_FILTER_NEAREST;
// Texture (2D image) bandwidth test: normalized coordinates with a repeat
// sampler, so out-of-range coordinates simply wrap around.
__kernel void tex_bw_test(__read_only image2d_t A, int count, __global float* ret) {
    int localId = get_local_id(0);
    // Spread starting positions across [0, 1) by global id
    float pos = get_global_id(0) * native_recip((float)get_global_size(0));
    float2 increment;
    increment.x = 0.01; // guessing
    increment.y = 0.01;
    float2 current0, current1, current2, current3;
    current0.x = pos;
    current0.y = pos;
    // BUGFIX: these offsets used integer division (localId / 10000 == 0 for any
    // realistic local id), so threads sampled identical coordinates. Divide in
    // floating point to give each lane a distinct position.
    current1.x = 0.1f + (localId / 10000.0f);
    current1.y = 0.1f + (localId / 10000.0f);
    current2.x = 0.01f + (localId / 10000.0f);
    current2.y = 0.01f + (localId / 10000.0f);
    current3.x = 0.002f + (localId / 5000.0f);
    current3.y = 0.001f + (localId / 5000.0f);
    // Four independent accumulator streams to keep read requests in flight
    float4 tmp0 = read_imagef(A, funny_sampler, current0);
    float4 tmp1 = read_imagef(A, funny_sampler, current1);
    float4 tmp2 = read_imagef(A, funny_sampler, current2);
    float4 tmp3 = read_imagef(A, funny_sampler, current3);
    for (int i = 0; i < count; i += 4)
    {
        tmp0 += read_imagef(A, funny_sampler, current0);
        tmp1 += read_imagef(A, funny_sampler, current1);
        tmp2 += read_imagef(A, funny_sampler, current2);
        tmp3 += read_imagef(A, funny_sampler, current3);
        current0 += increment;
        current1 += increment;
        current2 += increment;
        current3 += increment;
    }
    // Reduce to one scalar sink so the reads are not dead code
    *ret = dot(tmp0, tmp1) + dot(tmp2, tmp3);
}
// Cacheline size in bytes, must correspond to what's defined for the latency test
#define CACHELINE_SIZE 64
// Classic pointer-chasing latency test, unrolled 10x (Terascale stopped
// improving past a 10x unroll). Assumes count is a multiple of 10; with a
// large count the error from a non-multiple is negligible.
__kernel void unrolled_latency_test(__global const int* A, int count, __global int* ret) {
    // Multiple threads -> resume from a per-thread position in ret (tests
    // vector latency on AMD); single thread -> start at A[0] (scalar latency).
    int current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0];
    // BUGFIX: result was accumulated without ever being initialized (undefined
    // value); start it at 0 so the checksum written to ret is well-defined.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }
    // Sink so the chase can't be eliminated
    ret[0] = result;
}
// Ensures the loaded value (and hence the chased address) is constant across a
// workgroup, steering the access onto scalar/uniform hardware where present.
__kernel void scalar_unrolled_latency_test(__global const int* A, int count, __global int* ret) {
    // All lanes in a workgroup read the same ret element, so the chain stays uniform
    int current = get_num_groups(0) > 1 ? ret[get_group_id(0) * get_local_size(0)]: A[0];
    // BUGFIX: result was accumulated without ever being initialized (undefined
    // value); start it at 0 so the checksum written to ret is well-defined.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }
    // Sink so the chase can't be eliminated
    ret[0] = result;
}
// Runs one independent pointer chase per thread; size is the number of chain
// entries, so threads start at different points. Measures how much latency the
// GPU can hide as the number of in-flight chases grows.
__kernel void parallel_latency_test(__global const int* A, int count, int size, __global int* ret) {
    size_t tid = get_global_id(0);
    // Every thread begins at its own slot in the chain
    int node = A[tid % size];
    int sum = 0;
    // 10x unrolled dependent-load chain; each load's address comes from the
    // previous load, so nothing can be issued ahead of time
    for (int iter = 0; iter < count; iter += 10) {
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
    }
    // Per-thread sink keeps every chase alive
    ret[tid] = sum;
}
// Latency test like unrolled_latency_test, but chasing through __constant
// memory to probe the constant cache path.
__kernel void constant_unrolled_latency_test(__constant const int* A, int count, __global int* ret) {
    // Multiple threads -> resume from per-thread position in ret; single
    // thread -> start from A[0]
    int current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0];
    // BUGFIX: result was accumulated without ever being initialized (undefined
    // value); start it at 0 so the checksum written to ret is well-defined.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }
    // Sink so the chase can't be eliminated
    ret[0] = result;
}
#define local_mem_test_size 1024
// Local memory (LDS/shmem) latency: stage the chain into a 4 KB __local array,
// then chase it from a single lane.
__kernel void local_unrolled_latency_test(__global const int* A, int count, __global int* ret) {
    __local int local_a[local_mem_test_size]; // 4 KB, should be present on all GPUs, amirite?
    // Workgroup-wide strided copy from global into local memory
    for (int i = get_local_id(0);i < local_mem_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // Only lane 0 chases; everyone else can chill/get masked off
    if (get_local_id(0) == 0) {
        int current = local_a[0];
        // BUGFIX: result was accumulated without ever being initialized
        // (undefined value); start it at 0 so the checksum is well-defined.
        int result = 0;
        for (int i = 0; i < count; i += 10) {
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
        }
        // Sink so the chase can't be eliminated
        ret[0] = result;
    }
}
// Global memory read bandwidth: each thread strides through A (viewed as
// float4s) from a host-provided start position, wrapping back to its start
// when it runs off the end. skip is unused here but kept for interface
// compatibility with the host's start-position generation.
__kernel void sum_bw_test(__global float* A, uint count, uint float4size, __global float* ret, uint skip, __global uint *startPositions) {
    int threadId = get_global_id(0);
    int localSize = get_local_size(0);
    // BUGFIX: the accumulators were initialized with parenthesized comma
    // expressions like (0.1f,0.2f,0.3f,0.4f), which evaluate to just the last
    // scalar broadcast to all lanes. Use proper float4 literals.
    float4 result1 = (float4)(0.1f, 0.2f, 0.3f, 0.4f);
    float4 result2 = (float4)(1.1f, 1.2f, 1.3f, 1.4f);
    float4 result3 = (float4)(2.1f, 2.2f, 2.3f, 2.4f);
    float4 result4 = (float4)(3.0f, 3.1f, 3.2f, 3.3f);
    float4 result5 = (float4)(4.0f, 4.2f, 4.1f, 4.3f);
    // Host precomputes where each thread starts
    int initialIdx = startPositions[threadId];
    int idx = initialIdx;
    __global float4 *B = (__global float4 *)A;
    // i += 20: five float4 loads = 20 floats per iteration
    for (int i = 0; i < count; i += 20) {
        result1 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result2 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result3 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result4 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result5 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
    }
    // Reduce to one scalar per thread so the loads are not dead code
    ret[threadId] = dot(result1, result2) + dot(result3, result4) + dot(result4, result5);
}
#define local_mem_bw_test_size 1024
// Test bandwidth with local memory. A must be at least
// local_mem_bw_test_size floats.
__kernel void local_bw_test(__global float* A, uint count, __global float* ret) {
    __local float local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    float acc1 = 1.1;
    float acc2 = 2.2;
    float acc3 = 3.3;
    float acc4 = 4.4;
    // Workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // BUGFIX: mask the starting indices too - with a large workgroup,
    // localId + 2 * localSize could exceed 1023 and read past local_a on the
    // first iteration. Masking is a no-op when the index was already in range.
    int idx0 = localId & 0x3FF;
    int idx1 = (localId + localSize) & 0x3FF;
    int idx2 = (localId + localSize * 2) & 0x3FF;
    // i += 12: twelve float loads per iteration
    for (int i = 0; i < count; i += 12) {
        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        // BUGFIX: idx can be 0x3FF after masking, so the raw +1..+3 offsets
        // read up to local_a[1026], past the 1024-entry array. Wrap them.
        acc2 += local_a[(idx0 + 1) & 0x3FF] * local_a[(idx1 + 1) & 0x3FF] + local_a[(idx2 + 1) & 0x3FF];
        acc3 += local_a[(idx0 + 2) & 0x3FF] * local_a[(idx1 + 2) & 0x3FF] + local_a[(idx2 + 2) & 0x3FF];
        acc4 += local_a[(idx0 + 3) & 0x3FF] * local_a[(idx1 + 3) & 0x3FF] + local_a[(idx2 + 3) & 0x3FF];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = acc1 + acc2 + acc3 + acc4;
}
// LDS bandwidth with float4 elements (16 KB of local memory).
__kernel void local_float4_bw_test(__global float4* A, uint count, __global float* ret) {
    __local float4 local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    // Seed accumulators from the input so they can't be constant-folded
    float4 acc1 = A[get_global_id(0) & 0x3FF];
    float4 acc2 = A[(get_global_id(0) + 1) & 0x3FF];
    float4 acc3 = A[(get_global_id(0) + 2) & 0x3FF];
    float4 acc4 = A[(get_global_id(0) + 3) & 0x3FF];
    // Workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // BUGFIX: mask the starting indices too - with a large workgroup,
    // localId + 2 * localSize could exceed 1023 on the first iteration.
    // Masking is a no-op when the index was already in range.
    int idx0 = localId & 0x3FF;
    int idx1 = (localId + localSize) & 0x3FF;
    int idx2 = (localId + localSize * 2) & 0x3FF;
    // i += 12*4: twelve float4 loads = 48 floats per iteration
    for (int i = 0; i < count; i += (12*4)) {
        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        // BUGFIX: idx can be 0x3FF after masking, so the raw +1..+3 offsets
        // read up to local_a[1026], past the 1024-entry array. Wrap them.
        acc2 += local_a[(idx0 + 1) & 0x3FF] * local_a[(idx1 + 1) & 0x3FF] + local_a[(idx2 + 1) & 0x3FF];
        acc3 += local_a[(idx0 + 2) & 0x3FF] * local_a[(idx1 + 2) & 0x3FF] + local_a[(idx2 + 2) & 0x3FF];
        acc4 += local_a[(idx0 + 3) & 0x3FF] * local_a[(idx1 + 3) & 0x3FF] + local_a[(idx2 + 3) & 0x3FF];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = dot(acc1, acc2) + dot(acc3, acc4);
}
#define local64_test_size 2048 // size was given in 4B elements. This test uses 8B
// 64-bit LDS bandwidth: XOR-reduce ulong elements out of local memory.
__kernel void local_64_bw_test(__global ulong* A, uint count, __global ulong* ret) {
    __local ulong local_a[local64_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local64_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    ulong acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
    // assumes local memory size is at least 512x 64-bit uints
    // NOTE(review): the 0x1FF mask confines the steady-state walk to the first
    // 512 of the 2048 staged elements, and idx1 + 1 is unmasked on the first
    // iteration (it can reach localId + localSize + 1) - confirm workgroup
    // sizes keep that below local64_test_size.
    int idx0 = localId;
    int idx1 = localId + localSize;
    // i += 8 because each iteration moves 4 ulongs = 8 4-byte elements
    // (count is expressed in 4-byte units, per the define's comment)
    for (int i = 0; i < count; i += 8) {
        acc0 ^= local_a[idx0];
        acc1 ^= local_a[idx1];
        acc2 ^= local_a[idx0 + 1];
        acc3 ^= local_a[idx1 + 1];
        idx0 = (idx0 + localSize) & 0x1FF;
        idx1 = (idx1 + localSize) & 0x1FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = acc0 + acc1 + acc2 + acc3;
}
// Let's try the method from Zhe Jia et al.: every lane runs its own dependent
// pointer chase through local memory, so throughput under full occupancy
// reflects LDS bandwidth.
__kernel void local_chase_bw(__global uint* A, uint count, __global uint* ret) {
    // BUGFIX: this array was declared ulong while A and the chase values are
    // uint. That silently widened every element, doubling LDS usage and
    // turning the intended 32-bit LDS accesses into 64-bit ones.
    __local uint local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    // Each lane starts its chase at its own local id
    uint sink = localId;
    // Workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i = 0; i < count; i += 4)
    {
        sink = local_a[sink];
        sink = local_a[sink];
        sink = local_a[sink];
        sink = local_a[sink];
    }
    // Per-thread sink keeps every chase alive
    ret[threadId] = sink;
}
#define fixed_tex_test_size 1024
// Bandwidth through the texture/buffer-image path. Each read_imageui pulls a
// 4-wide uint vector; the 0x3FF masks keep indices inside the
// fixed_tex_test_size (1024) texel window.
__kernel void buffer_bw_test(__read_only image1d_buffer_t A, uint count, __global float* ret) {
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    // Seed accumulators from the buffer so they can't be constant-folded
    uint4 acc1 = read_imageui(A, 0);
    uint4 acc2 = read_imageui(A, 1);
    uint4 acc3 = read_imageui(A, 2);
    uint4 acc4 = read_imageui(A, 3);
    int idx0 = localId;
    int idx1 = localId + localSize;
    int idx2 = localId + localSize * 2;
    // i += 16: four counted reads x 4 components per iteration
    for (int i = 0; i < count; i += 16) {
        // BUGFIX: a fifth read_imageui(A, idx0) whose result was discarded sat
        // here - it wasn't counted by the i += 16 bookkeeping, skewing the
        // result on compilers that didn't eliminate it. Removed.
        acc1 += read_imageui(A, idx0);
        acc2 += read_imageui(A, idx1);
        acc3 += read_imageui(A, idx2);
        // BUGFIX: idx0 can be 0x3FF, so idx0 + 1 could address texel 1024,
        // one past the buffer; samplerless out-of-range reads are undefined.
        acc4 += read_imageui(A, (idx0 + 1) & 0x3FF);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce so the reads are not dead code
    float4 out1 = convert_float4(acc1);
    float4 out2 = convert_float4(acc2);
    float4 out3 = convert_float4(acc3);
    float4 out4 = convert_float4(acc4);
    ret[threadId] = dot(out1, out2) + dot(out3, out4);
}
// Integer add throughput: 12 dependent adds into sum per iteration.
// A = inputs, fixed size (only the first four elements are read, so the adds
// can't be constant-folded).
__kernel void int_exec_latency_test(__global int* A, int count, __global int* ret) {
    int sum = 0;
    int input1 = A[0], input2 = A[1], input3 = A[2], input4 = A[3];
    for (int i = 0; i < count; i++) {
        sum += input1;
        sum += input2;
        sum += input3;
        sum += input4;
        sum += input1;
        sum += input2;
        sum += input3;
        sum += input4;
        sum += input1;
        sum += input2;
        sum += input3;
        sum += input4;
    }
    // BUGFIX: sum was never written anywhere, so the entire loop was dead code
    // the compiler could (and likely did) eliminate, voiding the measurement.
    ret[0] = sum;
}
// hoping each thread/workgroup lands on a different CU
// A = pointer to location being bounced around
// count = iterations
// ret = sink
// t1 = id of thread 1
// t2 = id of thread 2
// Ticket ping-pong: t1 owns odd values, t2 owns even values. Each CAS succeeds
// only after the peer has published current - 1, so *A climbs 1,2,3,... and
// total time / (2 * count) approximates one cross-core atomic handoff.
__kernel void c2c_atomic_exec_latency_test(__global int* A, int count, __global int* ret, int t1, int t2) {
    int global_id = get_global_id(0);
    int current = 0;
    // The two participants claim alternating values of the sequence
    if (global_id == t1) current = 1;
    else if (global_id == t2) current = 2;
    // Every other thread exits immediately
    if (global_id == t1 || global_id == t2) {
        //printf("gid: %d, t1: %d, t2: %d, A: %d, current = %d\n", global_id, t1, t2, *A, current);
        while (current <= 2 * count) {
            // Spin until the peer publishes current - 1, then take the slot
            if (atomic_cmpxchg(A, current - 1, current) == current - 1) {
                current += 2;  // skip over the peer's next value
            }
        }
        ret[0] = current;
    }
}
// Ticket-style atomic handoff: thread g starts at ticket g + 1 and may only
// advance after a peer has published ticket - 1, so participants take turns
// bumping *A via compare-and-swap up to 2 * count.
__kernel void atomic_exec_latency_test(__global int* A, int count, __global int* ret) {
    int ticket = get_global_id(0) + 1;
    int limit = 2 * count;
    while (ticket <= limit) {
        int seen = atomic_cmpxchg(A, ticket - 1, ticket);
        if (seen == ticket - 1)
            ticket += 2;  // skip over the peer's ticket
    }
}
// Same ticket-style atomic ping-pong as atomic_exec_latency_test, but the
// contended location lives in local memory (LDS).
__kernel void local_atomic_latency_test(__global int* A, int count, __global int* ret) {
    __local int a[1];
    int current = get_global_id(0) + 1;
    // Thread with global id 0 seeds the LDS location from A
    if (current == 1) a[0] = A[0];
    // NOTE(review): barrier only synchronizes within one workgroup - this
    // assumes all participating threads share a workgroup; verify the launch.
    barrier(CLK_LOCAL_MEM_FENCE);
    while (current <= 2 * count) {
        // Advance only after a peer publishes current - 1
        if (atomic_cmpxchg(a, current - 1, current) == current - 1) {
            current += 2;  // skip over the peer's ticket
        }
    }
}
// Trivial kernel: each thread bumps its own element by one.
__kernel void dummy_add(__global int* A) {
    int gid = get_global_id(0);
    A[gid] = A[gid] + 1;
}
================================================
FILE: GpuMemLatency/kernels/atomic_exec_latency_test.cl
================================================
// Ticket-style atomic handoff: thread g starts at ticket g + 1; a CAS succeeds
// only when a peer has already published ticket - 1, so threads take turns
// incrementing *A up to 2 * count.
__kernel void atomic_exec_latency_test(__global int* A, int count, __global int* ret) {
    int current = get_global_id(0) + 1;
    while (current <= 2 * count) {
        if (atomic_cmpxchg(A, current - 1, current) == current - 1) {
            current += 2;  // skip over the peer's ticket
        }
    }
    // NOTE(review): nothing is written to ret here; the atomics themselves
    // keep the loop from being optimized away.
}
// Global-memory atomic add throughput: every thread hammers its own element,
// so there is no contention - this measures raw atomic issue rate.
// Eight distinct addends keep the compiler from folding the adds together.
__kernel void atomic_add_test(__global int *A, int count) {
    int base = get_global_id(0);
    int a1 = base + 5, a2 = base + 6, a3 = base + 7, a4 = base + 8;
    int a5 = base + 9, a6 = base + 10, a7 = base + 11;
    __global int *target = A + base;
    for (int iter = 0; iter < count; iter++)
    {
        atomic_add(target, base);
        atomic_add(target, a1);
        atomic_add(target, a2);
        atomic_add(target, a3);
        atomic_add(target, a4);
        atomic_add(target, a5);
        atomic_add(target, a6);
        atomic_add(target, a7);
    }
}
================================================
FILE: GpuMemLatency/kernels/buffer_bw_test.cl
================================================
#define fixed_tex_test_size 1024
// Bandwidth through the texture/buffer-image path. Each read_imageui pulls a
// 4-wide uint vector; the 0x3FF masks keep indices inside the
// fixed_tex_test_size (1024) texel window.
__kernel void buffer_bw_test(__read_only image1d_buffer_t A, uint count, __global float* ret) {
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    // Seed accumulators from the buffer so they can't be constant-folded
    uint4 acc1 = read_imageui(A, 0);
    uint4 acc2 = read_imageui(A, 1);
    uint4 acc3 = read_imageui(A, 2);
    uint4 acc4 = read_imageui(A, 3);
    int idx0 = localId;
    int idx1 = localId + localSize;
    int idx2 = localId + localSize * 2;
    // i += 16: four counted reads x 4 components per iteration
    for (int i = 0; i < count; i += 16) {
        // BUGFIX: a fifth read_imageui(A, idx0) whose result was discarded sat
        // here - it wasn't counted by the i += 16 bookkeeping, skewing the
        // result on compilers that didn't eliminate it. Removed.
        acc1 += read_imageui(A, idx0);
        acc2 += read_imageui(A, idx1);
        acc3 += read_imageui(A, idx2);
        // BUGFIX: idx0 can be 0x3FF, so idx0 + 1 could address texel 1024,
        // one past the buffer; samplerless out-of-range reads are undefined.
        acc4 += read_imageui(A, (idx0 + 1) & 0x3FF);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce so the reads are not dead code
    float4 out1 = convert_float4(acc1);
    float4 out2 = convert_float4(acc2);
    float4 out3 = convert_float4(acc3);
    float4 out4 = convert_float4(acc4);
    ret[threadId] = dot(out1, out2) + dot(out3, out4);
}
================================================
FILE: GpuMemLatency/kernels/c2c_atomic_exec_latency_test.cl
================================================
// hoping each thread/workgroup lands on a different CU
// A = pointer to location being bounced around
// count = iterations
// ret = sink
// t1 = id of thread 1
// t2 = id of thread 2
// Ticket ping-pong: t1 owns odd values, t2 owns even values. Each CAS succeeds
// only after the peer has published current - 1, so *A climbs 1,2,3,... and
// total time / (2 * count) approximates one cross-core atomic handoff.
__kernel void c2c_atomic_exec_latency_test(__global int* A, int count, __global int* ret, int t1, int t2) {
    int global_id = get_global_id(0);
    int current = 0;
    // The two participants claim alternating values of the sequence
    if (global_id == t1) current = 1;
    else if (global_id == t2) current = 2;
    // Every other thread exits immediately
    if (global_id == t1 || global_id == t2) {
        //printf("gid: %d, t1: %d, t2: %d, A: %d, current = %d\n", global_id, t1, t2, *A, current);
        while (current <= 2 * count) {
            // Spin until the peer publishes current - 1, then take the slot
            if (atomic_cmpxchg(A, current - 1, current) == current - 1) {
                current += 2;  // skip over the peer's next value
            }
        }
        ret[0] = current;
    }
}
================================================
FILE: GpuMemLatency/kernels/constant_unrolled_latency_test.cl
================================================
// Latency test like the unrolled one above, but chasing through __constant
// memory to probe the constant cache path.
__kernel void constant_unrolled_latency_test(__constant const int* A, int count, __global int* ret) {
    // Multiple threads -> resume from per-thread position in ret; single
    // thread -> start from A[0]
    int current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0];
    // BUGFIX: result was accumulated without ever being initialized (undefined
    // value); start it at 0 so the checksum written to ret is well-defined.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }
    // Sink so the chase can't be eliminated
    ret[0] = result;
}
================================================
FILE: GpuMemLatency/kernels/ldst_bw_test.cl
================================================
#define ldst_bw_test_size 1024
// test load/store bandwidth with a small test size that should fit in L1
/*__kernel void ldst_bw_test(__global float* A, uint count, __global float* ret) {
int threadId = get_global_id(0);
int localId = get_local_id(0);
int localSize = get_local_size(0);
int groupId = get_group_id(0);
float acc1 = 1.1;
float acc2 = 2.2;
float acc3 = 3.3;
float acc4 = 4.4;
// assumes local memory size is at least 1024 float4s
int idx0 = localId;
int idx1 = localId + localSize;
int idx2 = localId + localSize * 2;
for (int i = 0; i < count; i += 12) {
acc1 += A[idx0] * A[idx1] + A[idx2];
idx0 = clamp(idx0 + localSize, 0, ldst_bw_test_size);
idx1 = clamp(idx1 + localSize, 0, ldst_bw_test_size);
idx2 = clamp(idx2 + localSize, 0, ldst_bw_test_size);
acc2 += A[idx0] * A[idx1] + A[idx2];
idx0 = clamp(idx0 + localSize, 0, ldst_bw_test_size);
idx1 = clamp(idx1 + localSize, 0, ldst_bw_test_size);
idx2 = clamp(idx2 + localSize, 0, ldst_bw_test_size);
acc3 += A[idx0] * A[idx1] + A[idx2];
idx0 = clamp(idx0 + localSize, 0, ldst_bw_test_size);
idx1 = clamp(idx1 + localSize, 0, ldst_bw_test_size);
idx2 = clamp(idx2 + localSize, 0, ldst_bw_test_size);
acc4 += A[idx0] * A[idx1] + A[idx2];
idx0 = clamp(idx0 + localSize, 0, ldst_bw_test_size);
idx1 = clamp(idx1 + localSize, 0, ldst_bw_test_size);
idx2 = clamp(idx2 + localSize, 0, ldst_bw_test_size);
}
ret[threadId] = acc1 + acc2 + acc3 + acc4;
}*/
// Load bandwidth through the global memory path with a footprint small enough
// to stay cache-resident. A is a global buffer of at least 1024 float4s -
// the 0x3FF masks wrap all steady-state indices into that window.
__kernel void ldst_bw_test(__global float4* A, uint count, __global float* ret) {
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    float acc1 = 1.1;
    float acc2 = 2.2;
    float acc3 = 3.3;
    float acc4 = 4.4;
    // NOTE(review): idx1..idx3 are unmasked before the first loop iteration,
    // so the first reads can land past index 1023 when 4 * localSize > 1024 -
    // confirm the host's workgroup size keeps them in range.
    int idx0 = localId;
    int idx1 = idx0 + localSize;
    int idx2 = idx1 + localSize;
    int idx3 = idx2 + localSize;
    // i += 16*4: each line reads four float4s (16 floats), four lines = 64
    // floats per iteration; count is expressed in scalar floats
    for (int i = 0; i < count; i += (16*4)) {
        acc1 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        idx3 = (idx3 + localSize) & 0x3FF;
        acc2 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        idx3 = (idx3 + localSize) & 0x3FF;
        acc3 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        idx3 = (idx3 + localSize) & 0x3FF;
        acc4 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        idx3 = (idx3 + localSize) & 0x3FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = acc1 + acc2 + acc3 + acc4;
}
================================================
FILE: GpuMemLatency/kernels/local_64_bw_test.cl
================================================
#define local64_test_size 2048 // size was given in 4B elements. This test uses 8B
// 64-bit LDS bandwidth: XOR-reduce ulong elements out of local memory.
__kernel void local_64_bw_test(__global ulong* A, uint count, __global ulong* ret) {
    __local ulong local_a[local64_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    // Workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local64_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    ulong acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
    // The 0x1FF mask confines the steady-state walk to the first 512 elements
    int idx0 = localId;
    int idx1 = localId + localSize;
    // i += 8: each iteration moves 4 ulongs = 8 4-byte elements (count is
    // expressed in 4-byte units, per the define's comment)
    for (int i = 0; i < count; i += 8) {
        acc0 ^= local_a[idx0];
        acc1 ^= local_a[idx1];
        idx0 = (idx0 + localSize) & 0x1FF;
        idx1 = (idx1 + localSize) & 0x1FF;
        // BUGFIX: this second round used acc3 and an undeclared acc4 (a
        // compile error); the second pair of accumulators is acc2/acc3.
        acc2 ^= local_a[idx0];
        acc3 ^= local_a[idx1];
        idx0 = (idx0 + localSize) & 0x1FF;
        idx1 = (idx1 + localSize) & 0x1FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = acc0 + acc1 + acc2 + acc3;
}
================================================
FILE: GpuMemLatency/kernels/local_atomic_latency_test.cl
================================================
// Same ticket-style atomic ping-pong as atomic_exec_latency_test, but the
// contended location lives in local memory (LDS).
__kernel void local_atomic_latency_test(__global int* A, int count, __global int* ret) {
    __local int a[1];
    int current = get_global_id(0) + 1;
    // Thread with global id 0 seeds the LDS location from A
    if (current == 1) a[0] = A[0];
    // NOTE(review): barrier only synchronizes within one workgroup - this
    // assumes all participating threads share a workgroup; verify the launch.
    barrier(CLK_LOCAL_MEM_FENCE);
    while (current <= 2 * count) {
        // Advance only after a peer publishes current - 1
        if (atomic_cmpxchg(a, current - 1, current) == current - 1) {
            current += 2;  // skip over the peer's ticket
        }
    }
}
#define local_atomic_add_wg_size 256
// Local-memory atomic add throughput: every lane hammers its own LDS slot,
// so there is no contention. Eight distinct addends keep the compiler from
// folding the adds together.
__kernel void local_atomic_add_test(__global int *A, int count) {
    __local int local_a[local_atomic_add_wg_size];
    // Stage this lane's element into LDS
    local_a[get_local_id(0)] = A[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);
    int base = get_global_id(0);
    int a1 = base + 5, a2 = base + 6, a3 = base + 7, a4 = base + 8;
    int a5 = base + 9, a6 = base + 10, a7 = base + 11;
    __local int *slot = local_a + get_local_id(0);
    for (int iter = 0; iter < count; iter++)
    {
        atomic_add(slot, base);
        atomic_add(slot, a1);
        atomic_add(slot, a2);
        atomic_add(slot, a3);
        atomic_add(slot, a4);
        atomic_add(slot, a5);
        atomic_add(slot, a6);
        atomic_add(slot, a7);
    }
    // Write the accumulated value back so the atomics are not dead code
    A[get_global_id(0)] = local_a[get_local_id(0)];
}
================================================
FILE: GpuMemLatency/kernels/local_bw_test.cl
================================================
#define local_mem_bw_test_size 1024
// Test bandwidth with local memory. A must be at least
// local_mem_bw_test_size floats.
__kernel void local_bw_test(__global float* A, uint count, __global float* ret) {
    __local float local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    float acc1 = 1.1;
    float acc2 = 2.2;
    float acc3 = 3.3;
    float acc4 = 4.4;
    //printf("subgroup size %d\n", get_sub_group_size());
    // Workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // BUGFIX: mask the starting indices - with a large workgroup,
    // localId + 2 * localSize can exceed 1023 and read past local_a on the
    // first iteration. Masking is a no-op when the index was already in range.
    int idx0 = localId & 0x3FF;
    int idx1 = (localId + localSize) & 0x3FF;
    int idx2 = (localId + localSize * 2) & 0x3FF;
    // i += 12: twelve float loads per iteration
    for (int i = 0; i < count; i += 12) {
        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc2 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc3 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc4 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = acc1 + acc2 + acc3 + acc4;
}
================================================
FILE: GpuMemLatency/kernels/local_float4_bw_test.cl
================================================
#define local_mem_bw_test_size 1024
// LDS bandwidth with float4 elements (16 KB of local memory).
__kernel void local_float4_bw_test(__global float4* A, uint count, __global float* ret) {
    __local float4 local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    // Seed accumulators from the input so they can't be constant-folded
    float4 acc1 = A[get_global_id(0) & 0x3FF];
    float4 acc2 = A[(get_global_id(0) + 1) & 0x3FF];
    float4 acc3 = A[(get_global_id(0) + 2) & 0x3FF];
    float4 acc4 = A[(get_global_id(0) + 3) & 0x3FF];
    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0); i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // assumes local memory size is at least 1024 float4s
    // NOTE(review): idx1/idx2 are unmasked until the end of the first
    // iteration; they stay below 1024 only when 3 * localSize <= 1024 -
    // confirm the host's workgroup size.
    int idx0 = localId;
    int idx1 = localId + localSize;
    int idx2 = localId + localSize * 2;
    // i += 12*4: twelve float4 loads = 48 floats per iteration; count is in floats
    for (int i = 0; i < count; i += (12 * 4)) {
        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc2 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc3 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc4 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = dot(acc1, acc2) + dot(acc3, acc4);
}
// Mixed-path bandwidth: each line reads two float4s from global memory and
// does a read-modify-write of one float4 in local memory, stressing both
// datapaths at once.
__kernel void mixed_float4_bw_test(__global float4* A, uint count, __global float* ret) {
    __local float4 local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    // acc1..acc8 warm up / seed reads spread across the buffer
    float4 acc1 = A[get_global_id(0) & 0x3FF];
    float4 acc2 = A[(get_global_id(0) + 1) & 0x3FF];
    float4 acc3 = A[(get_global_id(0) + 2) & 0x3FF];
    float4 acc4 = A[(get_global_id(0) + 3) & 0x3FF];
    float4 acc5 = A[(get_global_id(0) + 4) & 0x3FF];
    float4 acc6 = A[(get_global_id(0) + 5) & 0x3FF];
    float4 acc7 = A[(get_global_id(0) + 6) & 0x3FF];
    float4 acc8 = A[(get_global_id(0) + 7) & 0x3FF];
    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // assumes local memory size is at least 1024 float4s
    // NOTE(review): idx1/idx2 are unmasked until the end of the first
    // iteration, and lanes can collide on local_a[idx0] when localSize does
    // not evenly divide 1024 - confirm the host's workgroup size.
    int idx0 = localId;
    int idx1 = localId + localSize;
    int idx2 = localId + localSize * 2;
    // i += 16*4: each line moves 4 float4s (3 reads + 1 write) = 16 floats
    for (int i = 0; i < count; i += (16*4)) {
        local_a[idx0] += A[idx1] * A[idx2]; // 4 * (3R 1W)
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        local_a[idx0] += A[idx1] * A[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        local_a[idx0] += A[idx1] * A[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        local_a[idx0] += A[idx1] * A[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce part of the LDS contents so the work is not dead code
    ret[threadId] = dot(local_a[get_local_id(0)], local_a[get_local_id(0) + 1]);
}
================================================
FILE: GpuMemLatency/kernels/local_unrolled_latency_test.cl
================================================
#define local_mem_test_size 1024
// Local memory (LDS/shmem) latency: stage the chain into a 4 KB __local array,
// then chase it from a single lane.
__kernel void local_unrolled_latency_test(__global const uint* A, int count, __global uint* ret) {
    __local uint local_a[local_mem_test_size]; // 4 KB, should be present on all GPUs, amirite?
    // Workgroup-wide strided copy from global into local memory
    for (int i = get_local_id(0);i < local_mem_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // Only lane 0 chases; everyone else can chill/get masked off
    if (get_local_id(0) == 0) {
        uint current = local_a[0];
        // BUGFIX: result was accumulated without ever being initialized
        // (undefined value); start it at 0 so the checksum is well-defined.
        uint result = 0;
        for (int i = 0; i < count; i += 10) {
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
        }
        // Sink so the chase can't be eliminated
        ret[0] = result;
    }
}
================================================
FILE: GpuMemLatency/kernels/scalar_unrolled_latency_test.cl
================================================
// Global memory latency with scalar accesses: the starting index is chosen so
// the loaded value is constant across a workgroup (lets hardware with a scalar
// path, e.g. AMD's scalar unit, service the chase).
__kernel void scalar_unrolled_latency_test(__global const uint* A, int count, __global uint* ret) {
    // With more than one group, each group picks up a per-group start position
    // from ret[]; otherwise everyone chases from A[0].
    uint current = get_num_groups(0) > 1 ? ret[get_group_id(0) * get_local_size(0)]: A[0];
    uint result = 0; // fix: was uninitialized — accumulating into it was undefined behavior
    // 10x unrolled pointer chase; count assumed to be a multiple of 10
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }
    // result depends on every load, so the chain can't be optimized away
    ret[0] = result;
}
================================================
FILE: GpuMemLatency/kernels/sum_bw_test.cl
================================================
// Global memory read bandwidth test using float4 loads. Each thread starts at
// a precomputed position (startPositions, filled by the host), strides through
// B (= A reinterpreted as float4s) by the workgroup size, and wraps back to
// its start position when it runs off the end. Five independent accumulators
// break the dependency chain so the loop stays load-bound.
__kernel void sum_bw_test(__global float* A, uint count, uint float4size, __global float* ret, uint skip, __global uint *startPositions) {
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    // fix: these were written as (0.1f,0.2f,0.3f,0.4f) without the (float4)
    // cast — the comma operator reduced each to its last scalar, broadcast
    // across all lanes. (float4)(...) is the intended vector literal.
    float4 result1 = (float4)(0.1f, 0.2f, 0.3f, 0.4f);
    float4 result2 = (float4)(1.1f, 1.2f, 1.3f, 1.4f);
    float4 result3 = (float4)(2.1f, 2.2f, 2.3f, 2.4f);
    float4 result4 = (float4)(3.0f, 3.1f, 3.2f, 3.3f);
    float4 result5 = (float4)(4.0f, 4.2f, 4.1f, 4.3f);
    int initialIdx = startPositions[threadId];
    int idx = initialIdx;
    __global float4 *B = (__global float4 *)A; // view buffer as float4 elements
    for (int i = 0; i < count; i += 20) { // 5 float4 loads = 20 floats per iteration
        result1 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result2 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result3 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result4 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result5 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
    }
    // Reduce every accumulator into the output so no load is dead.
    // NOTE(review): result4 appears in two dot() terms — possibly one was meant
    // to be dot(result5, result5); harmless for a bandwidth measurement.
    ret[threadId] = dot(result1, result2) + dot(result3, result4) + dot(result4, result5);
}
================================================
FILE: GpuMemLatency/kernels/tex_bw_test.cl
================================================
__constant sampler_t funny_sampler = CLK_NORMALIZED_COORDS_TRUE | // coordinates are from 0 to 1 (float)
    CLK_ADDRESS_REPEAT |  // going out of bounds = wrap/replicate
    CLK_FILTER_NEAREST;   // no filtering, fetch the nearest texel

// Texture (sampled image) read bandwidth test: four independent coordinate
// streams per thread, each advanced by a fixed increment every iteration so
// the TMUs see a steady stream of distinct addresses.
__kernel void tex_bw_test(__read_only image2d_t A, int count, __global float* ret) {
    int localId = get_local_id(0);
    // spread threads evenly across the normalized [0,1) coordinate space
    float pos = get_global_id(0) * native_recip((float)get_global_size(0));
    float2 increment;
    increment.x = 0.01f; // guessing
    increment.y = 0.01f;
    float2 current0, current1, current2, current3;
    current0.x = pos;
    current0.y = pos;
    // fix: localId / 10000 was integer division, which truncates to 0 for any
    // realistic local id — every lane started from identical coordinates.
    // Float division gives each lane a distinct starting offset as intended.
    // (Constants also get f suffixes to avoid double literals in OpenCL C.)
    current1.x = 0.1f + (localId / 10000.0f);
    current1.y = 0.1f + (localId / 10000.0f);
    current2.x = 0.01f + (localId / 10000.0f);
    current2.y = 0.01f + (localId / 10000.0f);
    current3.x = 0.002f + (localId / 5000.0f);
    current3.y = 0.001f + (localId / 5000.0f);
    float4 tmp0 = read_imagef(A, funny_sampler, current0);
    float4 tmp1 = read_imagef(A, funny_sampler, current1);
    float4 tmp2 = read_imagef(A, funny_sampler, current2);
    float4 tmp3 = read_imagef(A, funny_sampler, current3);
    for (int i = 0; i < count; i += 4) // 4 texture reads per iteration
    {
        tmp0 += read_imagef(A, funny_sampler, current0);
        tmp1 += read_imagef(A, funny_sampler, current1);
        tmp2 += read_imagef(A, funny_sampler, current2);
        tmp3 += read_imagef(A, funny_sampler, current3);
        current0 += increment;
        current1 += increment;
        current2 += increment;
        current3 += increment;
    }
    // All threads write the same location; the store only exists to keep the
    // reads from being optimized away.
    *ret = dot(tmp0, tmp1) + dot(tmp2, tmp3);
}
================================================
FILE: GpuMemLatency/kernels/tex_latency_test.cl
================================================
// Texture latency: pointer chase through a 1-D buffer image, where each
// loaded texel's .x component is the coordinate of the next load. No sampler
// is used — read_imageui takes the integer coordinate directly. 10x unrolled
// like the other latency kernels; count assumed to be a multiple of 10.
__kernel void tex_latency_test(__read_only image1d_buffer_t A, int count, __global int* ret, int list_size) {
    int lid = get_local_id(0);
    int gid = get_global_id(0);
    // Multi-threaded runs read each thread's starting coordinate out of ret[];
    // a single-thread run just begins at texel 0.
    int start = (get_global_size(0) > 1) ? ret[gid] : 0;
    uint4 current = read_imageui(A, start);
    for (int i = 0; i < count; i += 10) {
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
    }
    // Store the final position so the chase is observable and not dead code
    ret[gid] = current.x;
}
================================================
FILE: GpuMemLatency/kernels/unrolled_latency_test.cl
================================================
// unrolled until terascale no longer saw further improvement (10x unroll)
// assumes count will be a multiple of 10. but it won't be too inaccurate with a big count
// not divisible by 10
// unrolled until terascale no longer saw further improvement (10x unroll)
// assumes count will be a multiple of 10. but it won't be too inaccurate with
// a big count not divisible by 10
__kernel void unrolled_latency_test(__global const uint* A, int count, __global uint* ret) {
    // With >1 work-item, each one picks its start from ret[] — this will test
    // vector latency on AMD. A single work-item starts at A[0] (scalar latency).
    uint current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0];
    uint result = 0; // fix: was uninitialized — accumulating into it was undefined behavior
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }
    // result depends on every load, so the chain can't be optimized away
    ret[0] = result;
}
================================================
FILE: GpuMemLatency/latency_test.c
================================================
#include "opencltest.h"
// list_size = number of 4B (32-bit) elements
// Pointer-chase latency test over a global memory buffer.
//
// list_size        = number of 4B (32-bit) elements in the chase array
// chase_iterations = number of dependent loads to time (kernels unroll by 10)
// uniform          = nonzero: one shared chase pattern for all threads.
//                    zero + multiple threads: array partitioned per wave
// threads          = OpenCL global work size (0 => single work-item)
// local_size       = OpenCL local work size (0 => single work-item)
// wave_size        = threads per wave when partitioning (non-uniform case)
// stride           = element spacing passed to FillPatternArr — exact units
//                    depend on that helper (not visible here); the sanity
//                    check below treats stride*2 vs list_size*4 as bytes
// elapsed_ms       = optional out-parameter, kernel wall time in ms
//
// Returns latency per load in nanoseconds; 0 on failure, 1.0 when the
// stride/list size combination would touch fewer than two cache lines.
float latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t list_size,
    uint32_t chase_iterations,
    short uniform,
    int threads,
    int local_size,
    int wave_size,
    int stride,
    uint32_t *elapsed_ms)
{
    size_t global_item_size = 1, local_item_size = 1;
    cl_int ret;
    float latency = 0.0f; // initialized so every exit path returns a defined value
    int64_t time_diff_ms;
    uint32_t result;
    if (threads && local_size)
    {
        local_item_size = local_size;
        global_item_size = threads;
    }

    // Sanity check: make sure the chase will visit at least 2 cache lines,
    // including the case where the array is partitioned across waves
    if (!uniform && ((stride * 2 > list_size * 4) || // 2 cache lines
        ((threads > 1) && (stride * 2 > (list_size * 4 / (threads / wave_size)))))) // handle partition case
    {
        fprintf(stderr, "Less than 2 lines will be visited with stride %d, list size %dx 32-bit INTs\n", stride, list_size);
        return 1.0f;
    }

    // Fill pattern arr
    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * list_size);
    uint32_t* thread_start = (uint32_t*)malloc(sizeof(uint32_t) * (global_item_size));
    if (A == NULL || thread_start == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory for latency test\n");
        free(A);
        free(thread_start);
        return 0.0f;
    }
    memset(A, 0, sizeof(uint32_t) * list_size);
    if (threads < 2 || uniform) {
        FillPatternArr(A, list_size, stride);
        thread_start[0] = 0;
    }
    else
    {
        if (wave_size <= 1) wave_size = 1;
        // partition pattern arr, creating a section for each wave
        int wave_count = threads / wave_size;
        int sub_list_size = list_size / wave_count;
        for (int waveId = 0; waveId < wave_count; waveId++)
        {
            int waveId_start = sub_list_size * waveId;
            thread_start[wave_size * waveId] = waveId_start;
            FillPatternArr(A + waveId_start, sub_list_size, stride);
            // offset indices so each wave's chase stays within its own section
            for (int subIdx = 0; subIdx < sub_list_size; subIdx++)
            {
                A[waveId_start + subIdx] += waveId_start;
            }
        }
        // make sure all threads in a wave access the same item
        for (int i = 1; i < threads; i++)
        {
            int waveId = i / wave_size;
            thread_start[i] = thread_start[waveId * wave_size];
        }
    }

    // copy array to device; the result buffer doubles as per-thread start positions
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_size * sizeof(uint32_t), NULL, &ret);
    clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, list_size * sizeof(uint32_t), A, 0, NULL, NULL);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, global_item_size * sizeof(uint32_t), NULL, &ret);
    clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, global_item_size * sizeof(uint32_t), thread_start, 0, NULL, NULL);
    clFinish(command_queue);

    // Set kernel arguments
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to set list as kernel arg. clSetKernelArg returned %d\n", ret);
        latency = 0;
        goto cleanup;
    }
    ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);

    start_timing();
    // Execute the OpenCL kernel
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        latency = 0;
        goto cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        latency = 0;
        goto cleanup;
    }
    time_diff_ms = end_timing();
    if (elapsed_ms != NULL) *elapsed_ms = (uint32_t)time_diff_ms;
    // ms -> ns, spread across all chase iterations
    latency = 1e6 * (float)time_diff_ms / (float)chase_iterations;
    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t), &result, 0, NULL, NULL);
    clFinish(command_queue);
cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    free(A);
    free(thread_start); // fix: thread_start was leaked on every call
    return latency;
}
// Texture path version of latency_test: the chase array is bound as a 1-D
// buffer image (CL_R / CL_UNSIGNED_INT32) and the kernel chases via
// read_imageui. Pattern stride is fixed at CACHELINE_SIZE. With multiple
// threads the array is partitioned per wave, same as latency_test.
// Returns latency per load in nanoseconds, 0 on failure.
float tex_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t list_size,
    uint32_t chase_iterations,
    int threads,
    int local_size,
    int wave_size)
{
    size_t global_item_size = 1, local_item_size = 1;
    cl_int ret = 0;
    cl_mem a_mem_obj = NULL, result_obj = NULL, tex_obj = NULL;
    float latency = 0;
    if (threads > 1)
    {
        global_item_size = threads;
        local_item_size = local_size;
    }
    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * list_size);
    uint32_t* thread_start = (uint32_t*)malloc(sizeof(uint32_t) * (global_item_size));
    if (A == NULL || thread_start == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory for tex latency test\n");
        free(A);
        free(thread_start);
        return 0.0f;
    }
    memset(A, 0, sizeof(uint32_t) * list_size);
    if (threads < 2) {
        FillPatternArr(A, list_size, CACHELINE_SIZE);
        thread_start[0] = 0;
    }
    else
    {
        if (wave_size <= 1) wave_size = 1;
        // partition pattern arr, creating a section for each wave
        int wave_count = threads / wave_size;
        int sub_list_size = list_size / wave_count;
        for (int waveId = 0; waveId < wave_count; waveId++)
        {
            int waveId_start = sub_list_size * waveId;
            thread_start[wave_size * waveId] = waveId_start;
            FillPatternArr(A + waveId_start, sub_list_size, CACHELINE_SIZE);
            // offset indices so each wave's chase stays within its own section
            for (int subIdx = 0; subIdx < sub_list_size; subIdx++)
            {
                A[waveId_start + subIdx] += waveId_start;
            }
        }
        // make sure all threads in a wave access the same item
        for (int i = 1; i < threads; i++)
        {
            int waveId = i / wave_size;
            thread_start[i] = thread_start[waveId * wave_size];
        }
    }

    // use buffer as texture: create the backing buffer, then a 1D image over it
    a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_size * sizeof(uint32_t), NULL, &ret);
    clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, list_size * sizeof(uint32_t), A, 0, NULL, NULL);
    clFinish(command_queue);
    cl_image_format imageFormat;
    imageFormat.image_channel_data_type = CL_UNSIGNED_INT32;
    imageFormat.image_channel_order = CL_R; // one 32-bit uint per texel
    cl_image_desc imageDesc;
    memset(&imageDesc, 0, sizeof(cl_image_desc));
    imageDesc.buffer = a_mem_obj;
    imageDesc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
    imageDesc.image_width = list_size; // width in pixels
    tex_obj = clCreateImage(context, CL_MEM_READ_ONLY, &imageFormat, &imageDesc, NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to create image: %d\n", ret);
        goto texLatencyCleanup;
    }
    size_t origin[] = { 0, 0, 0 };
    size_t region[] = { imageDesc.image_width, 1, 1 };
    ret = clEnqueueWriteImage(command_queue, tex_obj, CL_TRUE, origin, region, 0, 0, A, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to copy image: %d\n", ret);
        goto texLatencyCleanup;
    }
    // ret buffer holds each thread's start position on input, end position on output
    result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, global_item_size * sizeof(uint32_t), NULL, &ret);
    clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, global_item_size * sizeof(uint32_t), thread_start, 0, NULL, NULL);
    clFinish(command_queue);
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tex_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    ret = clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&list_size);

    start_timing();
    // Execute the OpenCL kernel
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        latency = 0;
        goto texLatencyCleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        latency = 0;
        goto texLatencyCleanup;
    }
    uint64_t time_diff_ms = end_timing();
    // ms -> ns, spread across all chase iterations
    latency = 1e6 * (float)time_diff_ms / (float)chase_iterations;
    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, global_item_size * sizeof(uint32_t), thread_start, 0, NULL, NULL);
    clFinish(command_queue);
texLatencyCleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(tex_obj);
    clReleaseMemObject(result_obj);
    free(A);
    free(thread_start); // fix: thread_start was leaked on every call
    return latency;
}
================================================
FILE: GpuMemLatency/local_mem_latency_kernel.cl
================================================
// for testing total local memory capacity by seeing when workgroups can no
// longer overlap in time due to local mem capacity limits across the GPU.
// Calling code is expected to define LATENCY_LOCAL_MEM_SIZE (elements) via
// build options; the index masks below assume it is a power of two.
__kernel void unrolled_latency_test_localmem(__global const int* A, int count, __global int* ret) {
    __local int local_a[LATENCY_LOCAL_MEM_SIZE]; // forces a local mem allocation per WG
    int start = A[0]; // every thread loads the same address -> scalar latency, always
    int current = A[start];
    int result = 0; // fix: was uninitialized — accumulating into it was undefined behavior
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        // touch local memory each iteration so the allocation can't be elided
        local_a[i & (LATENCY_LOCAL_MEM_SIZE - 1)] = current;
    }
    // NOTE(review): result feeds no output — only the chase through local_a
    // reaches ret[0]. Kept for parity with the other unrolled kernels.
    ret[0] = local_a[current & (LATENCY_LOCAL_MEM_SIZE - 1)];
}
================================================
FILE: GpuMemLatency/opencltest.c
================================================
#include "opencltest.h"
// default test sizes for latency, in KB. Deliberately includes non-power-of-two
// points (144, 172, 41200, ...) to probe around cache boundaries.
// NOTE(review): 768432 may be a typo for 786432 (= 768 * 1024) — confirm
int default_test_sizes[] = { 1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 144, 160, 172, 192, 256, 384, 512, 600, 768, 1024, 1536, 2048, 3072, 4096, 5120, 6144,
8192, 16384, 18432, 20480, 24576, 25600, 28672, 32768, 36864, 40960, 41200, 49152, 65536, 98304, 131072, 196608, 262144, 524288, 768432, 819200, 921600, 1048576 };
// lining this up with nemes's VK bw test sizes. units for this one are in bytes
const uint64_t default_bw_test_sizes[] = {
4096, 8192, 12288, 16384, 20480, 24576, 28672, 32768, 40960, 49152, 57344, 65536, 81920, 98304, 114688, 131072,
196608, 262144, 393216, 458752, 524288, 786432, 1048576, 1572864, 2097152, 3145728, 4194304, 6291456, 8388608, 12582912, 16777216, 20971520,
25165824, 33554432, 37748736, 41943040, 50331648, 58720256, 67108864, 100663296, 134217728, 201326592, 268435456, 402653184, 536870912, 805306368,
1073741824, 1610579968, 2147483648, 3221225472, 4294967296
};
// forward declarations for helpers used by main() below
float int_exec_latency_test(cl_context context,
cl_command_queue command_queue,
cl_kernel kernel,
uint32_t iterations);
uint32_t scale_bw_iterations(uint32_t base_iterations, uint32_t size_kb);
uint64_t scale_iterations(uint32_t size_kb, uint64_t iterations);
cl_ulong get_max_buffer_size();
cl_ulong get_max_constant_buffer_size();
// Which benchmark main() runs; selected by the "-test <name>" argument
// (the command-line name handled in main() is noted per value).
enum TestType {
VectorMemLatency,    // "vectorlatency": global memory latency, vector accesses (default)
ScalarMemLatency,    // "scalarlatency": global memory latency, scalar accesses
ConstantMemLatency,  // "constantlatency": constant memory latency
LocalMemCapacity,    // "localmemcapacity": GPU-wide local memory capacity
LocalMemLatency,     // "locallatency": local mem latency
TexMemLatency,       // "texlatency": texture mem latency
GlobalAtomicLatency, // "globalatomiccmpxchg": global atomic latency (cmpxchg)
LocalAtomicLatency,  // "localatomiccmpxchg": local atomic latency (cmpxchg)
GlobalAtomicAdd,     // "globalatomicadd": global atomic add throughput
LocalAtomicAdd,      // "localatomicadd": local atomic add throughput
GlobalMemBandwidth,  // "bw": global memory bandwidth
LocalMemBandwidth,   // "localbw": local memory bandwidth
LocalMemChaseBandwidth, // "localchasebw": local mem bw via pointer chasing, many waves
LocalMem64Bandwidth, // "local64bw": local mem bw with 64-bit loads
LocalMemFloat4Bandwidth, // "localfloat4bw": local mem bw with float4 loads
MixedFloat4Bandwidth, // "mixedbw": mixed local/global float4 load bw
LoadStoreBandwidth,  // "ldstbw": load/store bandwidth
TextureThroughput,   // "tmu": TMU throughput
BufferBandwidth,     // "bufferbw": buffer bandwidth
MemBandwidthWorkgroupScaling, // "scaling": bw scaling with workgroup count
CoreToCore,          // "c2c": latency with global atomics across CUs
LinkBandwidth,       // "link": host <-> GPU link bandwidth
InstructionRate,     // "instructionrate": instruction rate
Divergence,          // "divergence": throughput vs. consecutive threads doing the same op
Partition,           // "partition": execution unit partitioning (wave size must be set)
MemDivergence        // "memdivergence": memory access divergence cost
};
int main(int argc, char* argv[]) {
cl_int ret;
uint32_t stride = 64;
uint32_t list_size = 3840 * 2160 * 4;
uint32_t chase_iterations = 1e6 * 7;
// skip = 0 means auto
uint32_t thread_count = 1, local_size = 1, skip = 0, wave = 0;
float result;
int platform_index = -1, device_index = -1;
enum TestType testType = VectorMemLatency;
char thread_count_set = 0, local_size_set = 0, chase_iterations_set = 0, skip_set = 0;
int sizeKb = 0;
int forceCuCount = 0;
int forcefp16 = 0, forcefp64 = 0;
// vars for local mem capacity testing
int local_mem_size_kb = 0; // local mem allocated for each wg
int group_count = 0; // max wg count
for (int argIdx = 1; argIdx < argc; argIdx++) {
if (*(argv[argIdx]) == '-') {
char* arg = argv[argIdx] + 1;
if (_strnicmp(arg, "stride", 6) == 0) {
argIdx++;
stride = atoi(argv[argIdx]);
fprintf(stderr, "Using stride = %u\n", stride);
}
else if (_strnicmp(arg, "iterations", 10) == 0) {
argIdx++;
chase_iterations = atoi(argv[argIdx]);
chase_iterations_set = 1;
fprintf(stderr, "Using %u iterations\n", chase_iterations);
}
else if (_strnicmp(arg, "threads", 7) == 0) {
argIdx++;
thread_count = atoi(argv[argIdx]);
thread_count_set = 1;
fprintf(stderr, "Using %u threads\n", thread_count);
}
else if (_strnicmp(arg, "localsize", 9) == 0) {
argIdx++;
local_size = atoi(argv[argIdx]);
local_size_set = 1;
fprintf(stderr, "Using local size = %u\n", local_size);
}
else if (_strnicmp(arg, "wave", 4) == 0) {
argIdx++;
wave = atoi(argv[argIdx]);
fprintf(stderr, "Estimated wave size = %u\n", wave);
}
else if (_strnicmp(arg, "platform", 8) == 0) {
argIdx++;
platform_index = atoi(argv[argIdx]);
fprintf(stderr, "Using OpenCL platform index %d\n", platform_index);
}
else if (_strnicmp(arg, "device", 6) == 0) {
argIdx++;
device_index = atoi(argv[argIdx]);
fprintf(stderr, "Using OpenCL device index %d\n", device_index);
}
else if (_strnicmp(arg, "bwskip", 6) == 0) {
argIdx++;
skip = atoi(argv[argIdx]);
fprintf(stderr, "Workgroups will be spaced %u apart\n", skip);
}
else if (_strnicmp(arg, "sizekb", 6) == 0) {
argIdx++;
sizeKb = atoi(argv[argIdx]);
fprintf(stderr, "Only testing %d KB\n", sizeKb);
}
else if (_strnicmp(arg, "localmemsize", 12) == 0)
{
argIdx++;
local_mem_size_kb = atoi(argv[argIdx]);
fprintf(stderr, "Testing with %d of local memory allocated per WG\n", local_mem_size_kb);
}
else if (_strnicmp(arg, "groupcount", 10) == 0)
{
argIdx++;
group_count = atoi(argv[argIdx]);
fprintf(stderr, "Testing with up to %d WGs\n", group_count);
}
else if (_strnicmp(arg, "saveprogram", 11) == 0) {
saveprogram = 1;
fprintf(stderr, "Writing compiled program to disk\n");
}
else if (_strnicmp(arg, "forcefp16", 10) == 0) {
forcefp16 = 1;
fprintf(stderr, "For instruction rate testing, will run FP16 tests regardless of whether support is advertised\n");
}
else if (_strnicmp(arg, "forcefp64", 10) == 0) {
forcefp64 = 1;
fprintf(stderr, "For instruction rate testing, will run FP64 tests regardless of whether support is advertised\n");
}
else if (_strnicmp(arg, "test", 4) == 0) {
argIdx++;
if (_strnicmp(argv[argIdx], "vectorlatency", 13) == 0) {
testType = VectorMemLatency;
fprintf(stderr, "Testing global memory latency, vector accesses\n");
}
else if (_strnicmp(argv[argIdx], "scalarlatency", 13) == 0) {
testType = ScalarMemLatency;
fprintf(stderr, "Testing global memory latency, scalar accesses\n");
}
else if (_strnicmp(argv[argIdx], "constantlatency", 15) == 0) {
testType = ConstantMemLatency;
fprintf(stderr, "Testing constant memory latency\n");
}
else if (_strnicmp(argv[argIdx], "memdivergence", 13) == 0) {
testType = MemDivergence;
fprintf(stderr, "Testing memory access divergence cost\n");
}
else if (_strnicmp(argv[argIdx], "localmemcapacity", 16) == 0) {
testType = LocalMemCapacity;
fprintf(stderr, "Testing GPU-wide local memory capacity. Make sure localmemsize/groupcount are set appropriately!\n");
if (sizeKb == 0) sizeKb = 1;
if (group_count == 0) group_count = 16;
}
else if (_strnicmp(argv[argIdx], "globalatomiccmpxchg", 19) == 0) {
testType = GlobalAtomicLatency;
fprintf(stderr, "Testing global atomic latency (cmpxchg)\n");
}
else if (_strnicmp(argv[argIdx], "globalatomicadd", 15) == 0)
{
testType = GlobalAtomicAdd;
fprintf(stderr, "Testing global atomic add\n");
}
else if (_strnicmp(argv[argIdx], "locallatency", 13) == 0) {
testType = LocalMemLatency;
fprintf(stderr, "Testing local mem latency\n");
}
else if (_strnicmp(argv[argIdx], "texlatency", 10) == 0) {
testType = TexMemLatency;
fprintf(stderr, "Testing texture mem latency\n");
}
else if (_strnicmp(argv[argIdx], "localatomiccmpxchg", 18) == 0) {
testType = LocalAtomicLatency;
fprintf(stderr, "Testing local atomic latency (cmpxchg)\n");
}
else if (_strnicmp(argv[argIdx], "localatomicadd", 14) == 0) {
testType = LocalAtomicAdd;
fprintf(stderr, "Testing local atomic add\n");
}
else if (_strnicmp(argv[argIdx], "bw", 2) == 0) {
testType = GlobalMemBandwidth;
fprintf(stderr, "Testing global memory bandwidth\n");
// Somewhat reasonable defaults
if (!thread_count_set) thread_count = 131072;
if (!local_size_set) local_size = 256;
if (!chase_iterations_set) chase_iterations = 500000;
}
else if (_strnicmp(argv[argIdx], "localbw", 7) == 0) {
testType = LocalMemBandwidth;
if (!thread_count_set) thread_count = 262144;
if (!local_size_set) local_size = 256;
fprintf(stderr, "Testing local memory bandwidth\n");
}
else if (_strnicmp(argv[argIdx], "localchasebw", 12) == 0) {
testType = LocalMemChaseBandwidth;
fprintf(stderr, "Testing local memory bandwidth using pointer chasing and lots of waves\n");
}
else if (_strnicmp(argv[argIdx], "local64bw", 9) == 0) {
testType = LocalMem64Bandwidth;
fprintf(stderr, "Testing local memory bandwidth using 64-bit loads\n");
}
else if (_strnicmp(argv[argIdx], "localfloat4bw", 13) == 0) {
testType = LocalMemFloat4Bandwidth;
fprintf(stderr, "Testing local memory bandwidth using float4 (4x32-bit) loads\n");
}
else if (_strnicmp(argv[argIdx], "mixedbw", 7) == 0) {
testType = MixedFloat4Bandwidth;
fprintf(stderr, "Mixed local/global load bw test with float4\n");
}
else if (_strnicmp(argv[argIdx], "bufferbw", 8) == 0) {
testType = BufferBandwidth;
fprintf(stderr, "Testing buffer bandwidth\n");
}
else if (_strnicmp(argv[argIdx], "ldstbw", 6) == 0) {
testType = LoadStoreBandwidth;
fprintf(stderr, "Testing load/store bandwidth\n");
}
else if (_strnicmp(argv[argIdx], "scaling", 7) == 0)
{
testType = MemBandwidthWorkgroupScaling;
fprintf(stderr, "Testing BW scaling with workgroups\n");
if (!chase_iterations_set) chase_iterations = 20000000;
if (argIdx + 1 < argc && argv[argIdx + 1][0] != '-')
{
argIdx++;
forceCuCount = atoi(argv[argIdx]);
fprintf(stderr, "Using up to %d workgroups\n", forceCuCount);
}
}
else if (_strnicmp(argv[argIdx], "c2c", 3) == 0)
{
testType = CoreToCore;
fprintf(stderr, "Testing latency with global atomics across CU count\n");
}
else if (_strnicmp(argv[argIdx], "link", 4) == 0)
{
testType = LinkBandwidth;
fprintf(stderr, "Testing host <-> GPU link bandwidth\n");
if (!chase_iterations_set) chase_iterations = 30000000;
}
else if (_strnicmp(argv[argIdx], "instructionrate", 15) == 0)
{
testType = InstructionRate;
fprintf(stderr, "Testing instruction rate\n");
if (!chase_iterations_set) chase_iterations = 1000;
if (!local_size_set && !thread_count_set) {
local_size = 256;
thread_count = 32768;
fprintf(stderr, "Selecting local size = %d, threads = %d\n", local_size, thread_count);
}
}
else if (_strnicmp(argv[argIdx], "tmu", 3) == 0)
{
testType = TextureThroughput;
fprintf(stderr, "Testing TMUs\n");
}
else if (_strnicmp(argv[argIdx], "divergence", 10) == 0)
{
testType = Divergence;
fprintf(stderr, "Testing compute throughput with varying numbers of consecutive threads doing the same op\n");
if (!local_size_set && !thread_count_set) {
local_size = 256;
thread_count = 32768;
fprintf(stderr, "Selecting local size = %d, threads = %d\n", local_size, thread_count);
}
}
else if (_strnicmp(argv[argIdx], "partition", 9) == 0)
{
testType = Partition;
fprintf(stderr, "Testing execution unit partitioning. Make sure wave size is set!\n");
}
else {
fprintf(stderr, "I'm so confused. Unknown test type %s\n", argv[argIdx]);
}
}
}
}
if (argc == 1)
{
fprintf(stderr, "Usage:\n\t[-test ]\n\t[-platform ]\n\t[-device ]\n");
fprintf(stderr, "\t[-threads ]\n\t[-localsize ]\n\t[-bwskip ]\n");
fprintf(stderr, "Number of threads (OpenCL global work size) must be divisible by local work size\n");
}
fprintf(stderr, "Using %d threads with local size %d\n", thread_count, local_size);
#pragma region opencl_overhead
// Create an OpenCL context
cl_context context = get_context_from_user(platform_index, device_index);
if (context == NULL) exit(1);
// Load kernel
cl_program program = build_program(context, "kernel.cl", NULL);
if (saveprogram) write_program(program, "kernel");
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, selected_device_id, 0, &ret);
fprintf(stderr, "clCreateCommandQueue returned %d\n", ret);
cl_kernel c2c_atomic_latency_test_kernel = clCreateKernel(program, "c2c_atomic_exec_latency_test", &ret);
cl_kernel dummy_add_kernel = clCreateKernel(program, "dummy_add", &ret);
cl_kernel local_bw_chase_kernel = clCreateKernel(program, "local_chase_kernel", &ret);
#pragma endregion opencl_overhead
max_global_test_size = get_max_buffer_size();
if (testType == GlobalAtomicLatency)
{
cl_program prog = build_program(context, "atomic_exec_latency_test.cl", NULL);
cl_kernel atomic_latency_test_kernel = clCreateKernel(prog, "atomic_exec_latency_test", &ret);
if (saveprogram) write_program(prog, "atomic_exec_latency_test");
chase_iterations = 200000;
uint32_t elapsed_ms = 0, target_ms = 2000;
while (elapsed_ms < target_ms / 2) {
result = int_atomic_latency_test(context, command_queue, atomic_latency_test_kernel, chase_iterations, false, &elapsed_ms);
fprintf(stderr, "%d iterations, %u ms => %f ns\n", chase_iterations, elapsed_ms, result);
chase_iterations = scale_iterations_to_target(chase_iterations, elapsed_ms, target_ms);
}
printf("global atomic latency: %f\n", result);
clReleaseKernel(atomic_latency_test_kernel);
clReleaseProgram(prog);
}
else if (testType == LocalAtomicLatency)
{
cl_program prog = build_program(context, "local_atomic_latency_test.cl", NULL);
cl_kernel local_atomic_latency_test_kernel = clCreateKernel(prog, "local_atomic_latency_test", &ret);
if (saveprogram) write_program(prog, "local_atomic_latency_test");
chase_iterations = 500000;
uint32_t elapsed_ms = 0, target_ms = 2000;
while (elapsed_ms < target_ms / 2) {
result = int_atomic_latency_test(context, command_queue, local_atomic_latency_test_kernel, chase_iterations, true, &elapsed_ms);
fprintf(stderr, "%d iterations, %u ms => %f ns\n", chase_iterations, elapsed_ms, result);
chase_iterations = scale_iterations_to_target(chase_iterations, (float)elapsed_ms, (float)target_ms);
}
printf("local atomic latency: %f\n", result);
clReleaseKernel(local_atomic_latency_test_kernel);
clReleaseProgram(prog);
}
else if (testType == GlobalAtomicAdd)
{
cl_program prog = build_program(context, "atomic_exec_latency_test.cl", NULL);
cl_kernel global_atomic_add_kernel = clCreateKernel(prog, "atomic_add_test", &ret);
if (saveprogram) write_program(prog, "atomic_exec_latency_test");
result = int_atomic_add_test(context, command_queue, global_atomic_add_kernel, thread_count, local_size);
fprintf(stderr, "Global atomic INT32 adds: %f GOPS\n", result);
}
else if (testType == LocalAtomicAdd)
{
cl_program prog = build_program(context, "local_atomic_latency_test.cl", NULL);
cl_kernel local_atomic_add_kernel = clCreateKernel(prog, "local_atomic_add_test", &ret);
if (saveprogram) write_program(prog, "local_atomic_latency_test");
result = int_atomic_add_test(context, command_queue, local_atomic_add_kernel, thread_count, local_size);
fprintf(stderr, "Local atomic INT32 adds: %f GOPS\n", result);
}
else if (testType == VectorMemLatency || testType == ScalarMemLatency)
{
cl_program prog;
cl_kernel globalMemLatencyKernel;
if (testType == ScalarMemLatency)
{
prog = build_program(context, "scalar_unrolled_latency_test.cl", NULL);
globalMemLatencyKernel = clCreateKernel(prog, "scalar_unrolled_latency_test", &ret);
if (saveprogram) write_program(prog, "scalar_unrolled_latency_test");
}
else // Vector mem latency
{
prog = build_program(context, "unrolled_latency_test.cl", NULL);
globalMemLatencyKernel = clCreateKernel(prog, "unrolled_latency_test", &ret);
if (saveprogram) write_program(prog, "unrolled_latency_test");
}
fprintf(stderr, "Doing %d K p-chase iterations with stride %d over %d KiB region\n", chase_iterations / 1000, stride, list_size * 4 / 1024);
printf("\nSattolo, global memory latency (up to %llu K) unroll:\n", max_global_test_size / 1024);
for (int size_idx = 0; size_idx < sizeof(default_test_sizes) / sizeof(int); size_idx++) {
if (max_global_test_size < sizeof(int) * 256 * default_test_sizes[size_idx]) {
printf("%d K would exceed device's max buffer size of %llu K, stopping here.\n", default_test_sizes[size_idx], max_global_test_size / 1024);
break;
}
result = latency_test(context, command_queue,
globalMemLatencyKernel, 256 * default_test_sizes[size_idx], scale_iterations(default_test_sizes[size_idx], chase_iterations), false, thread_count, local_size, wave, stride, NULL);
printf("%d,%f\n", default_test_sizes[size_idx], result);
if (result == 0) {
printf("Something went wrong, not testing anything bigger.\n");
break;
}
}
clReleaseKernel(globalMemLatencyKernel);
clReleaseProgram(prog);
}
else if (testType == MemDivergence) {
cl_program vecProg, texProg;
cl_kernel vecKernel, texKernel;
fprintf(stderr, "Testing mem divergence with localsize %d, test size %d KB\n", local_size, sizeKb);
// vector
vecProg = build_program(context, "unrolled_latency_test.cl", NULL);
if (saveprogram) write_program(vecProg, "vector_unrolled_latency_test");
vecKernel = clCreateKernel(vecProg, "unrolled_latency_test", &ret);
texProg = build_program(context, "tex_latency_test.cl", NULL);
texKernel = clCreateKernel(texProg, "tex_latency_test", &ret);
if (saveprogram) write_program(texProg, "tex_latency_test");
float* memDivergenceResults = (float*)malloc(sizeof(float) * local_size * 2);
for (int threadCount = 1; threadCount <= local_size; threadCount++) {
float vecResult = latency_test(context, command_queue, vecKernel, 256 * sizeKb, scale_iterations(sizeKb, chase_iterations), false, threadCount, threadCount, 1, stride, NULL);
memDivergenceResults[threadCount * 2] = vecResult;
float texResult = tex_latency_test(context, command_queue, texKernel, 256 * sizeKb, scale_iterations(sizeKb, chase_iterations), threadCount, threadCount, 1);
memDivergenceResults[threadCount * 2 + 1] = texResult;
fprintf(stderr, "%d threads: %f vec, %f tex\n", threadCount, vecResult, texResult);
}
for (int threadCount = 1; threadCount <= local_size; threadCount++) {
printf("%d,%f,%f\n", threadCount, memDivergenceResults[threadCount * 2], memDivergenceResults[threadCount * 2 + 1]);
}
clReleaseKernel(texKernel);
clReleaseKernel(vecKernel);
clReleaseProgram(texProg);
clReleaseProgram(vecProg);
free(memDivergenceResults);
}
else if (testType == LocalMemCapacity)
{
char build_options[128];
const char* local_mem_define_prefix = "-D LATENCY_LOCAL_MEM_SIZE=";
memset(build_options, 0, 128);
memcpy(build_options, local_mem_define_prefix, 26);
snprintf(build_options + 26, 128 - 26, "%u", 256 * local_mem_size_kb);
cl_program program = build_program(context, "local_mem_latency_kernel.cl", build_options);
cl_kernel local_mem_capacity_kernel = clCreateKernel(program, "unrolled_latency_test_localmem", &ret);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Could not create local mem capacity testing kernel\n");
exit(0);
}
if (saveprogram) write_program(program, "local_mem_latency_kernel");
fprintf(stderr, "Testing local memory capacity with %u KB of local mem per WG, up to %u WGs\n", local_mem_size_kb, group_count);
printf("Groups,Local Mem Capacity,Latency\n");
for (int groups = 1; groups <= group_count; groups++) {
result = latency_test(context, command_queue,
local_mem_capacity_kernel,
256 * sizeKb,
(uint32_t)scale_iterations(sizeKb, chase_iterations),
true,
groups,
1,
1,
64,
NULL);
printf("%d,%d,%f\n", groups, groups* local_mem_size_kb, result);
}
clReleaseKernel(local_mem_capacity_kernel);
clReleaseProgram(program);
}
else if (testType == ConstantMemLatency)
{
cl_program prog = build_program(context, "constant_unrolled_latency_test.cl", NULL);
cl_kernel constant_kernel = clCreateKernel(prog, "constant_unrolled_latency_test", &ret);
if (saveprogram) write_program(prog, "constant_unrolled_latency_test");
cl_ulong max_constant_test_size = get_max_constant_buffer_size();
printf("\nSattolo, constant memory (up to %llu K), no-unroll:\n", max_constant_test_size / 1024);
for (int size_idx = 0; size_idx < sizeof(default_test_sizes) / sizeof(int); size_idx++) {
if (max_constant_test_size < sizeof(int) * 256 * default_test_sizes[size_idx]) {
printf("%d K would exceed device's max constant buffer size of %llu K, stopping here.\n", default_test_sizes[size_idx], max_constant_test_size / 1024);
break;
}
result = latency_test(context, command_queue, constant_kernel, 256 * default_test_sizes[size_idx], scale_iterations(default_test_sizes[size_idx], chase_iterations), false, thread_count, local_size, wave, stride, NULL);
printf("%d,%f\n", default_test_sizes[size_idx], result);
if (result == 0) {
printf("Something went wrong, not testing anything bigger.\n");
break;
}
}
clReleaseKernel(constant_kernel);
clReleaseProgram(program);
}
else if (testType == TexMemLatency)
{
cl_program prog = build_program(context, "tex_latency_test.cl", NULL);
cl_kernel tex_latency_kernel = clCreateKernel(prog, "tex_latency_test", &ret);
if (saveprogram) write_program(prog, "tex_latency_test");
cl_ulong max_tex_test_size = get_max_tex_buffer_size();
for (int size_idx = 0; size_idx < sizeof(default_test_sizes) / sizeof(int); size_idx++) {
if (default_test_sizes[size_idx] * 1024 > max_tex_test_size) {
printf("%d K would exceed device's texture buffer size of %llu K, stopping here.\n", default_test_sizes[size_idx], max_tex_test_size / 1024);
break;
}
result = tex_latency_test(context, command_queue, tex_latency_kernel, 256 * default_test_sizes[size_idx], scale_iterations(default_test_sizes[size_idx], chase_iterations),
thread_count, local_size, wave);
printf("%d,%f\n", default_test_sizes[size_idx], result);
if (result == 0) {
printf("Something went wrong, not testing anything bigger.\n");
break;
}
}
clReleaseKernel(tex_latency_kernel);
clReleaseProgram(prog);
}
else if (testType == LocalMemLatency)
{
cl_program prog = build_program(context, "local_unrolled_latency_test.cl", NULL);
cl_kernel local_kernel = clCreateKernel(prog, "local_unrolled_latency_test", &ret);
if (saveprogram) write_program(prog, "local_unrolled_latency_test");
uint32_t elapsed_ms = 0, target_ms = 2000;
chase_iterations = 50000;
while (elapsed_ms < target_ms / 2) {
result = latency_test(context, command_queue, local_kernel, 1024, chase_iterations, false, thread_count, local_size, wave, stride, &elapsed_ms);
fprintf(stderr, "%u iterations, %u ms -> %f ns\n", chase_iterations, elapsed_ms, result);
chase_iterations = scale_iterations_to_target(chase_iterations, elapsed_ms, target_ms);
}
printf("Local mem latency: %f\n", result);
clReleaseKernel(local_kernel);
clReleaseProgram(prog);
}
else if (testType == GlobalMemBandwidth)
{
cl_program prog = build_program(context, "sum_bw_test.cl", NULL);
cl_kernel bw_kernel = clCreateKernel(prog, "sum_bw_test", &ret);
if (saveprogram) write_program(prog, "sum_bw_test");
fprintf(stderr, "Using %u threads, %u local size, %u base iterations\n", thread_count, local_size, chase_iterations);
printf("\nMemory bandwidth (up to %llu K):\n", max_global_test_size / 1024);
if (!sizeKb) {
for (int size_idx = 0; size_idx < sizeof(default_bw_test_sizes) / sizeof(unsigned long long); size_idx++) {
uint64_t testSizeKb = default_bw_test_sizes[size_idx] / 1024;
if ((max_global_test_size / 1024) < testSizeKb) {
printf("%llu K would exceed device's max buffer size of %llu K, stopping here.\n", testSizeKb, max_global_test_size / 1024);
break;
}
result = bw_test(context,
command_queue,
bw_kernel, 256 * testSizeKb,
thread_count,
local_size,
skip,
scale_bw_iterations(chase_iterations, testSizeKb));
printf("%llu,%f\n", testSizeKb, result);
if (result == 0) {
printf("Something went wrong, not testing anything bigger.\n");
break;
}
}
}
else {
result = bw_test(context,
command_queue,
bw_kernel, 256 * sizeKb,
thread_count,
local_size,
skip,
scale_bw_iterations(chase_iterations, sizeKb));
printf("%lu,%f\n", sizeKb, result);
if (result == 0) {
printf("Something went wrong, not testing anything bigger.\n");
}
}
clReleaseKernel(bw_kernel);
clReleaseProgram(prog);
}
else if (testType == LocalMemBandwidth ||
testType == LocalMem64Bandwidth ||
testType == BufferBandwidth ||
testType == LoadStoreBandwidth ||
testType == TextureThroughput ||
testType == LocalMemFloat4Bandwidth ||
testType == MixedFloat4Bandwidth)
{
cl_program prog;
cl_kernel local_bw_kernel = NULL, local_64_bw_kernel = NULL, local_float4_bw_kernel = NULL, buffer_bw_kernel = NULL, tex_bw_kernel = NULL, loadstore_bw_kernel = NULL;
cl_kernel mixed_bw_kernel = NULL;
if (testType == LocalMemBandwidth)
{
prog = build_program(context, "local_bw_test.cl", NULL);
local_bw_kernel = clCreateKernel(prog, "local_bw_test", &ret);
if (saveprogram) write_program(prog, "local_bw_test");
}
else if (testType == LocalMem64Bandwidth) {
prog = build_program(context, "local_64_bw_test.cl", NULL);
local_64_bw_kernel = clCreateKernel(prog, "local_64_bw_test", &ret);
if (saveprogram) write_program(prog, "local_64_bw_test");
}
else if (testType == LocalMemFloat4Bandwidth) {
prog = build_program(context, "local_float4_bw_test.cl", NULL);
local_float4_bw_kernel = clCreateKernel(prog, "local_float4_bw_test", &ret);
if (saveprogram) write_program(prog, "local_float4_bw_test");
}
else if (testType == BufferBandwidth) {
prog = build_program(context, "buffer_bw_test.cl", NULL);
buffer_bw_kernel = clCreateKernel(prog, "buffer_bw_test", &ret);
if (saveprogram) write_program(prog, "buffer_bw_test");
}
else if (testType == LoadStoreBandwidth)
{
prog = build_program(context, "ldst_bw_test.cl", NULL);
loadstore_bw_kernel = clCreateKernel(prog, "ldst_bw_test", &ret);
if (saveprogram) write_program(prog, "ldst_bw_test");
}
else if (testType == MixedFloat4Bandwidth)
{
prog = build_program(context, "local_float4_bw_test.cl", NULL);
mixed_bw_kernel = clCreateKernel(prog, "mixed_float4_bw_test", NULL);
if (saveprogram) write_program(prog, "mixed_float4_bw_test");
}
else { // tex throughput
prog = build_program(context, "tex_bw_test.cl", NULL);
tex_bw_kernel = clCreateKernel(prog, "tex_bw_test", &ret);
if (saveprogram) write_program(prog, "tex_bw_test");
}
uint32_t thread_low = 1024, thread_high = 1048576*4;
if (!thread_count_set) thread_count = thread_low;
float max_bw = 0;
while (true) {
int64_t elapsed_ms = 0, target_ms = 1500;
if (!chase_iterations_set) chase_iterations = 500000;
while (elapsed_ms < target_ms / 2)
{
if (testType == LocalMemBandwidth) {
fprintf(stderr, "Testing local mem bw\n");
result = local_bw_test(context, command_queue, local_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);
}
else if (testType == LocalMem64Bandwidth) {
fprintf(stderr, "Testing local mem bw with 64-bit loads\n");
result = local_64_bw_test(context, command_queue, local_64_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);
}
else if (testType == LocalMemFloat4Bandwidth) {
fprintf(stderr, "Testing local mem bw with float4 loads\n");
result = local_bw_test(context, command_queue, local_float4_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);
}
else if (testType == MixedFloat4Bandwidth) {
fprintf(stderr, "Testing mixed local/global bw with float4 loads\n");
result = local_bw_test(context, command_queue, mixed_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);
}
else if (testType == BufferBandwidth)
{
fprintf(stderr, "Testing buffer bw\n");
result = buffer_bw_test(context, command_queue, buffer_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);
}
else if (testType == LoadStoreBandwidth)
{
fprintf(stderr, "Testing global load bandwidth\n");
result = local_bw_test(context, command_queue, loadstore_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);
}
else if (testType == TextureThroughput)
{
fprintf(stderr, "Testing texture throughput\n");
result = tex_bw_test(context,
command_queue,
tex_bw_kernel,
256, // width
256, // height
thread_count,
local_size,
0,
chase_iterations,
&elapsed_ms);
}
fprintf(stderr, "%u threads, %u local size, %u iterations ==> %f GB/s, elapsed time %lld ms\n",
thread_count, local_size, chase_iterations, result, elapsed_ms);
if (elapsed_ms < 25) chase_iterations *= 2;
else chase_iterations = (uint32_t)((float)chase_iterations * (target_ms / elapsed_ms));
if (result == 0)
{
fprintf(stderr, "Run failed\n");
break;
}
if (chase_iterations_set) break;
}
if (result > max_bw) max_bw = result;
if (thread_count_set) break;
thread_count *= 2;
if (thread_count > thread_high) break;
}
printf("Bandwidth: %f GB/s\n", max_bw);
}
else if (testType == LocalMemChaseBandwidth)
{
int thread_scan_done = 0;
uint32_t thread_low = 256, thread_high = 524288 * 4;
fprintf(stderr, "Testing local memory bandwidth using pointer chasing. Ensure wave size is set correctly with -wave\n");
if (!thread_count_set) thread_count = thread_low;
while (!thread_scan_done) {
// ignore chase iterations and auto manage it
int64_t elapsed_ms = 0, target_ms = 1500;
chase_iterations = 500000;
if (thread_count_set) thread_scan_done = 0;
else
{
thread_count *= 2;
if (thread_count > thread_high) break;
}
while (elapsed_ms < target_ms / 2)
{
result = local_chase_bw_test(context, command_queue, local_bw_chase_kernel, thread_count, local_size, chase_iterations, wave, &elapsed_ms);
fprintf(stderr, "%u threads, %u local size, %u wave, %u iterations ==> %f GB/s, elapsed time %lld ms\n",
thread_count, local_size, wave, chase_iterations, result, elapsed_ms);
if (elapsed_ms < 25) chase_iterations *= 2;
else chase_iterations = (uint32_t)((float)chase_iterations * (target_ms / elapsed_ms));
if (result == 0)
{
fprintf(stderr, "Run failed\n");
break;
}
}
}
printf("Local memory bandwidth: %f GB/s\n", result);
}
else if (testType == MemBandwidthWorkgroupScaling)
{
cl_program prog = build_program(context, "sum_bw_test.cl", NULL);
cl_kernel bw_kernel = clCreateKernel(prog, "sum_bw_test", &ret);
if (saveprogram) write_program(prog, "sum_bw_test");
uint32_t testSizeCount = sizeof(default_bw_test_sizes) / sizeof(unsigned long long);
cl_uint cuCount = forceCuCount ? forceCuCount : getCuCount();
fprintf(stderr, "Device has %u compute units\n", cuCount);
float* scalingResults = (float*)malloc(sizeof(float) * cuCount * testSizeCount);
for (uint32_t workgroupCount = 1; workgroupCount <= cuCount; workgroupCount++)
{
if (!sizeKb) {
for (int size_idx = 0; size_idx < testSizeCount; size_idx++)
{
uint64_t testSizeKb = default_bw_test_sizes[size_idx] / 1024;
fprintf(stderr, "Testing size %llu KB, %u workgroups\n", testSizeKb, workgroupCount);
if ((max_global_test_size / 1024) < testSizeKb) {
printf("%llu K would exceed device's max buffer size of %llu K\n", testSizeKb, max_global_test_size / 1024);
scalingResults[(workgroupCount - 1) * testSizeCount + size_idx] = 0;
continue;
}
result = bw_test(context,
command_queue,
bw_kernel, 256 * testSizeKb,
local_size * workgroupCount,
local_size,
skip,
scale_bw_iterations(chase_iterations, testSizeKb));
scalingResults[(workgroupCount - 1) * testSizeCount + size_idx] = result;
fprintf(stderr, "%u workgroups, %llu KB = %f GB/s\n", workgroupCount, testSizeKb, result);
}
}
else {
fprintf(stderr, "Testing size %d KB, %u workgroups\n", sizeKb, workgroupCount);
result = bw_test(context,
command_queue,
bw_kernel, 256 * sizeKb,
local_size * workgroupCount,
local_size,
skip,
scale_bw_iterations(chase_iterations, sizeKb));
scalingResults[workgroupCount - 1] = result;
fprintf(stderr, "%u workgroups, %lu KB = %f GB/s\n", workgroupCount, sizeKb, result);
}
}
if (!sizeKb) {
for (uint32_t workgroupCount = 1; workgroupCount <= cuCount; workgroupCount++)
{
printf(",%u", workgroupCount);
}
printf("\n");
for (int size_idx = 0; size_idx < testSizeCount; size_idx++)
{
printf("%llu", default_bw_test_sizes[size_idx] / 1024);
for (uint32_t workgroupCount = 1; workgroupCount <= cuCount; workgroupCount++)
{
printf(",%f", scalingResults[(workgroupCount - 1) * testSizeCount + size_idx]);
}
printf("\n");
}
}
else {
printf("For %d KB:\n", sizeKb);
for (int workgroupIdx = 0; workgroupIdx < cuCount; workgroupIdx++)
{
printf("%d,%f\n", workgroupIdx + 1, scalingResults[workgroupIdx]);
}
printf("\n");
}
free(scalingResults);
clReleaseKernel(bw_kernel);
clReleaseProgram(prog);
}
else if (testType == CoreToCore)
{
c2c_atomic_latency_test(context, command_queue, c2c_atomic_latency_test_kernel, chase_iterations);
}
else if (testType == LinkBandwidth)
{
link_bw_test(context, command_queue, dummy_add_kernel, chase_iterations);
}
else if (testType == InstructionRate)
{
instruction_rate_test(context, command_queue, thread_count, local_size, chase_iterations, forcefp16, forcefp64);
}
else if (testType == Divergence)
{
int current_wave = 1;
int max_wave = 512;
printf("Contiguous Thread Block Size,FP32 GOPs\n");
while (current_wave <= max_wave)
{
float gops = run_divergence_rate_test(context, command_queue, thread_count, local_size, current_wave, NULL);
printf("%d,%f\n", current_wave, gops);
current_wave *= 2;
}
}
else if (testType == Partition)
{
// function and its associated kernel serve two purposes
int pattern4[] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 };
float result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, pattern4);
printf("Throughput: %f\n", result);
int patterns[] = { 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0 };
result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, patterns);
printf("Throughput: %f\n", result);
int pattern2[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, pattern2);
printf("Throughput: %f\n", result);
int consec_pattern[] = { 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0 };
result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, consec_pattern);
printf("Throughput: %f\n", result);
}
//printf("If you didn't run this through cmd, now you can copy the results. And press ctrl+c to close");
//scanf("\n");
// Clean up
cleanup:
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseProgram(program);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
return 0;
}
///
/// Heuristic to make sure test runs for enough time but not too long
///
/// Region size
/// base iterations
/// scaled iterations
uint64_t scale_iterations(uint32_t size_kb, uint64_t iterations) {
return 10 * iterations / pow(size_kb, 1.0 / 4.0);
}
#define INT_EXEC_INPUT_SIZE 16

/// Measures integer execution latency by timing a kernel with a single work-item
/// (global and local size both 1), so a dependent chain in the kernel exposes
/// per-operation latency rather than throughput.
///
/// context       - OpenCL context the kernel was created under
/// command_queue - queue to submit the kernel on
/// kernel        - latency kernel taking (input array, iteration count, result buffer)
/// iterations    - loop iteration count passed to the kernel
/// returns latency in ns per operation, or 0 on failure
float int_exec_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations)
{
    cl_int ret;
    cl_int result = 0;
    size_t global_item_size = 1;
    size_t local_item_size = 1;
    float latency;
    uint32_t time_diff_ms;
    uint32_t A[INT_EXEC_INPUT_SIZE];
    for (int i = 0; i < INT_EXEC_INPUT_SIZE; i++) A[i] = i;
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, INT_EXEC_INPUT_SIZE * sizeof(uint32_t), NULL, &ret);
    // Bug fix: the last argument of clCreateBuffer is the error-code out-parameter
    // and must be &ret. The original passed &result, clobbering the value that is
    // subsequently written into the result buffer with the create status code.
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &ret);
    if (a_mem_obj == NULL || result_obj == NULL)
    {
        fprintf(stderr, "Failed to create buffers for int exec latency test\n");
        latency = 0;
        goto cleanup;
    }
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, INT_EXEC_INPUT_SIZE * sizeof(uint32_t), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(cl_int), &result, 0, NULL, NULL);
    clFinish(command_queue);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        latency = 0;
        goto cleanup;
    }
    clFinish(command_queue);
    time_diff_ms = end_timing();
    // 1e6 converts ms -> ns; the *12 divisor assumes 12 dependent ops per kernel
    // loop iteration — TODO confirm against the kernel source
    latency = 1e6 * (float)time_diff_ms / (float)(iterations * 12);
cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    return latency;
}
// Halves the iteration count once the bandwidth test region reaches 4 MB,
// keeping large-region runs from taking disproportionately long.
uint32_t scale_bw_iterations(uint32_t base_iterations, uint32_t size_kb)
{
    return (size_kb >= 4096) ? base_iterations / 2 : base_iterations;
}
================================================
FILE: GpuMemLatency/opencltest.h
================================================
#pragma once
#ifndef opencltestheader
#define opencltestheader
// NOTE(review): the include targets below appear to have been stripped by the
// text extraction (bare "#include" lines) — restore the original system headers
// (stdio/stdlib/string/stdint/math, per the functions used) before building.
#include
#include
#include
#include
#include
#include "../Common/timing.h"
// int-valued boolean flags for pre-C99 compatibility (no <stdbool.h> assumed)
#define false 0
#define true 1
// allow use of OpenCL 1.2 APIs that newer SDK headers mark deprecated
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifndef __APPLE__
#include
#else
#include
#endif
// maximum bytes read when loading an OpenCL kernel source file
#define MAX_SOURCE_SIZE (0x100000)
#define CACHELINE_SIZE 64
// default target runtime per test, in milliseconds
#define TARGET_TIME_MS 2000
// MSVC provides _strnicmp; map to strncmp elsewhere (note: strncmp is
// case-sensitive, so matching behavior differs slightly off-MSVC)
#ifndef _MSC_VER
#define _strnicmp strncmp
#endif

// globals set during device/platform selection (defined elsewhere)
extern cl_device_id selected_device_id;
extern cl_platform_id selected_platform_id;
extern cl_ulong max_global_test_size;
extern int saveprogram;

// context / program setup helpers
cl_context get_context_from_user(int platform_index, int device_index);
cl_program build_program(cl_context context, const char* fname, const char *params);
void write_program(cl_program program, const char *name);
uint32_t adjust_iterations(uint32_t iterations, uint64_t time_ms);
void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment);

// device capability queries
cl_uint getCuCount();
size_t getMaxWorkgroupSize();
cl_ulong get_max_constant_buffer_size();
cl_ulong get_max_buffer_size();
cl_ulong get_max_tex_buffer_size();
cl_ulong get_max_2d_tex_width();
cl_ulong get_max_2d_tex_height();

// atomic latency/throughput tests
float int_atomic_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations,
    short local,
    uint32_t *time_ms);
float int_atomic_add_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    size_t threads,
    size_t localsize);

// pointer-chasing memory latency tests; return latency in ns, 0 on failure
float latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t list_size,
    uint32_t chase_iterations,
    short uniform,
    int threads,
    int local_size,
    int wave,
    int stride,
    uint32_t *elapsed_ms);
float tex_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t list_size,
    uint32_t chase_iterations,
    int threads,
    int local_size,
    int wave_size);

// bandwidth tests; return GB/s, 0 on failure
float bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint64_t list_size,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t skip,
    uint32_t chase_iterations);
float tex_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint64_t width,
    uint64_t height,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t randomize,
    uint32_t chase_iterations,
    int64_t *time_ms);
float local_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int64_t *time_ms);
float local_chase_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    uint32_t wave_size,
    int64_t* time_ms);
float local_64_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int64_t* time_ms);
float buffer_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int64_t* time_ms);

// cross-device / cross-core tests
void link_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations);
float c2c_atomic_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations);

// compute throughput tests
float instruction_rate_test(cl_context context,
    cl_command_queue command_queue,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int forcefp16,
    int forcefp64);
float run_divergence_rate_test(cl_context context,
    cl_command_queue command_queue,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t wave,
    int *pattern);
#endif
================================================
FILE: GpuMemLatency/opencltest.sln
================================================
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.30503.244
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "opencltest", "opencltest.vcxproj", "{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|x64 = Release|x64
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x64.ActiveCfg = Debug|x64
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x64.Build.0 = Debug|x64
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x86.ActiveCfg = Debug|Win32
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x86.Build.0 = Debug|Win32
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x64.ActiveCfg = Release|x64
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x64.Build.0 = Release|x64
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x86.ActiveCfg = Release|Win32
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x86.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {4447E91D-E7A1-4249-87A7-E75A78167E71}
EndGlobalSection
EndGlobal
================================================
FILE: GpuMemLatency/opencltest.vcxproj
================================================
Debug
Win32
Release
Win32
Debug
x64
Release
x64
16.0
Win32Proj
{fa51d7f4-f6e0-4cb5-9cdd-ad39a3519f78}
opencltest
10.0
Application
true
v143
Unicode
Application
false
v143
true
Unicode
Application
true
v143
Unicode
Application
false
v143
true
Unicode
true
false
true
false
Level3
true
WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
Level3
true
_CRT_SECURE_NO_WARNINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
$(SolutionDir)\OpenCL\include;%(AdditionalIncludeDirectories)
Console
true
$(SolutionDir)\OpenCL\lib;%(AdditionalLibraryDirectories)
OpenCL.lib;%(AdditionalDependencies)
Level3
true
true
true
_CRT_SECURE_NO_WARNINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
$(SolutionDir)\OpenCL\include;%(AdditionalIncludeDirectories)
Console
true
true
true
$(SolutionDir)\OpenCL\lib;%(AdditionalLibraryDirectories)
OpenCL.lib;%(AdditionalDependencies)
false
CppCode
false
Document
false
Document
false
Document
false
Document
Document
Document
Document
Document
Document
Document
Document
Document
Document
Document
Document
Document
Document
Document
================================================
FILE: GpuMemLatency/opencltest.vcxproj.filters
================================================
================================================
FILE: GpuMemLatency/texturetest.c
================================================
#include "opencltest.h"
================================================
FILE: InstructionRate/Makefile
================================================
# Build the instruction-rate microbenchmarks for each supported architecture.
# arch_detect.mk presumably sets TARGET to match the build host — verify there.
# NOTE(review): recipe lines below appear to have lost their leading tabs in
# extraction; GNU Make requires each recipe line to start with a tab.
include ../Common/arch_detect.mk
CFLAGS = -O3
# default: build whichever target arch_detect.mk selected
all: $(TARGET)
# x86-64 Linux build (C driver + hand-written assembly kernels)
amd64:
$(CC) $(CFLAGS) x86_instructionrate.s x86_instructionrate.c -o InstructionRate_amd64 $(LDFLAGS)
# AArch64 Linux build
aarch64:
$(CC) $(CFLAGS) -march=native -pthread arm_instructionrate.s arm_instructionrate.c -o InstructionRate_aarch64 $(LDFLAGS)
# RISC-V 64-bit build (rv64gc baseline ISA)
riscv64:
$(CC) $(CFLAGS) -march=rv64gc -pthread riscv_instructionrate.s riscv_instructionrate.c -o InstructionRate_riscv64 $(LDFLAGS)
# Android/Termux build: clang with AES extension enabled for the crypto tests
termux:
clang -march=armv8+aes arm_instructionrate.s arm_instructionrate.c -o InstructionRate_aarch64 $(LDFLAGS)
# x86-64 macro-op fusion test binary
amd64_fusion:
$(CC) $(CFLAGS) x86_fusion.s x86_fusion.c -o InstructionRateFusion_amd64 $(LDFLAGS)
# Windows x86-64 cross/native build
w64:
$(CC) $(CFLAGS) x86_instructionrate.c x86_instructionrate.s -o InstructionRate_w64.exe $(LDFLAGS)
# everything CI builds
ci: amd64 amd64_fusion aarch64 riscv64 w64
clean:
rm -f *.o && find . -type f -executable -delete
.PHONY: all ci clean
================================================
FILE: InstructionRate/arm_instructionrate.c
================================================
#define _GNU_SOURCE
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
extern uint64_t noptest(uint64_t iterations);
extern uint64_t clktest(uint64_t iterations);
extern uint64_t addtest(uint64_t iterations);
extern uint64_t eortest(uint64_t iterations);
extern uint64_t maddaddtest(uint64_t iterations);
extern uint64_t cmptest(uint64_t iterations);
extern uint64_t addmultest(uint64_t iterations);
extern uint64_t addmul21test(uint64_t iterations);
extern uint64_t mul32test(uint64_t iterations);
extern uint64_t mul64test(uint64_t iterations);
extern uint64_t latmul64test(uint64_t iterations);
extern uint64_t jmptest(uint64_t iterations);
extern uint64_t fusejmptest(uint64_t iterations);
extern uint64_t mixmuljmptest(uint64_t iterations);
extern uint64_t mixmuljmptest21(uint64_t iterations);
extern uint64_t mixaddjmptest(uint64_t iterations);
extern uint64_t mixaddjmp21test(uint64_t iterations);
extern uint64_t rortest(uint64_t iterations);
extern uint64_t mixmulrortest(uint64_t iterations);
extern uint64_t vecadd128test(uint64_t iterations, int arr[4]);
extern uint64_t latvecadd128test(uint64_t iterations, int arr[4]);
extern uint64_t vecmul128test(uint64_t iterations, int arr[4]);
extern uint64_t latvecmul128test(uint64_t iterations, int arr[4]);
extern uint64_t mixvecaddmul128test(uint64_t iterations, int arr[4]);
extern uint64_t faddtest(uint64_t iterations, float arr[4]);
extern uint64_t latfaddtest(uint64_t iterations, float arr[4]);
extern uint64_t vecfadd128test(uint64_t iterations, float arr[4]);
extern uint64_t vecfmul128test(uint64_t iterations, float arr[4]);
extern uint64_t latvecfadd128test(uint64_t iterations, float arr[4]);
extern uint64_t latvecfmul128test(uint64_t iterations, float arr[4]);
extern uint64_t mixvecfaddfmul128test(uint64_t iterations, float arr[4]);
extern uint64_t vecfma128test(uint64_t iterations, float arr[4]);
extern uint64_t scalarfmatest(uint64_t iterations, float arr[4]);
extern uint64_t latvecfma128test(uint64_t iterations, float arr[4]);
extern uint64_t latscalarfmatest(uint64_t iterations, float arr[4]);
extern uint64_t mixvecfaddfma128test(uint64_t iterations, float arr[4]);
extern uint64_t mixvecfmulfma128test(uint64_t iterations, float arr[4]);
// see if SIMD pipeline shares ports with scalar ALU ones
extern uint64_t mixaddvecadd128test(uint64_t iterations, int arr[4]);
extern uint64_t mix3to1addvecadd128test(uint64_t iterations, int arr[4]);
extern uint64_t mix1to1addvecadd128test(uint64_t iterations, int arr[4]);
extern uint64_t mixmulvecmultest(uint64_t iterations, int arr[4]);
// are vec int and vec fp on the same port?
extern uint64_t mixvecmulfmultest(uint64_t iterations, float farr[4], int iarr[4]);
extern uint64_t mixvecaddfaddtest(uint64_t iterations, float farr[4], int iarr[4]);
// where are the branch ports
extern uint64_t mixjmpvecaddtest(uint64_t iterations, int arr[4]);
extern uint64_t mixjmpvecmultest(uint64_t iterations, int arr[4]);
// load/store
extern uint64_t loadtest(uint64_t iterations, int arr[4]);
extern uint64_t mixloadstoretest(uint64_t iterations, int arr[4], int sink[4]);
extern uint64_t mix21loadstoretest(uint64_t iterations, int arr[4], int sink[4]);
extern uint64_t vecloadtest(uint64_t iterations, int arr[4]);
extern uint64_t vecstoretest(uint64_t iterations, int arr[4], int sink[4]);
// renamer tests
extern uint64_t indepmovtest(uint64_t iterations);
extern uint64_t depmovtest(uint64_t iterations);
extern uint64_t xorzerotest(uint64_t iterations);
extern uint64_t movzerotest(uint64_t iterations);
extern uint64_t subzerotest(uint64_t iterations);
// Is crypto separate
extern uint64_t aesetest(uint64_t iterations, int arr[4]);
extern uint64_t mixaesevecadd128test(uint64_t iterations, int arr[4]);
extern uint64_t pmulltest(uint64_t iterations, int arr[4]);
extern uint64_t mixpmulladd128test(uint64_t iterations, int arr[4]);
float fpTestArr[4] __attribute__ ((aligned (64))) = { 0.2, 1.5, 2.7, 3.14 };
/* 64B-aligned source operands for the load / vector-op kernels */
int intTestArr[4] __attribute__ ((aligned (64))) = { 1, 2, 3, 4 };
/* 64B-aligned store target for the store kernels */
int sinkArr[4] __attribute__ ((aligned (64))) = { 2, 3, 4, 5 };
/* Times testfunc(iterations) and returns operations per clock, given the
   previously estimated clock speed. Defined below. */
float measureFunction(uint64_t iterations, float clockSpeedGhz, uint64_t (*testfunc)(uint64_t));
/* Forward declarations for the wrapper functions defined at the bottom of this
   file; each binds one of the global test arrays to an asm kernel. */
uint64_t vecadd128wrapper(uint64_t iterations);
uint64_t latvecadd128wrapper(uint64_t iterations);
uint64_t vecmul128wrapper(uint64_t iterations);
uint64_t latvecmul128wrapper(uint64_t iterations);
uint64_t mixvecaddmul128wrapper(uint64_t iterations);
uint64_t faddwrapper(uint64_t iterations);
uint64_t latfaddwrapper(uint64_t iterations);
uint64_t vecfadd128wrapper(uint64_t iterations);
uint64_t latvecfadd128wrapper(uint64_t iterations);
uint64_t vecfmul128wrapper(uint64_t iterations);
uint64_t latvecfmul128wrapper(uint64_t iterations);
uint64_t mixvecfaddfmul128wrapper(uint64_t iterations);
uint64_t mixaddvecadd128wrapper(uint64_t iterations);
uint64_t mix3to1addvecadd128wrapper(uint64_t iterations);
uint64_t mix1to1addvecadd128wrapper(uint64_t iterations);
uint64_t mixmulvecmulwrapper(uint64_t iterations);
uint64_t mixvecmulfmulwrapper(uint64_t iterations);
uint64_t mixvecaddfaddwrapper(uint64_t iterations);
uint64_t mixjmpvecaddwrapper(uint64_t iterations);
uint64_t mixjmpvecmulwrapper(uint64_t iterations);
uint64_t vecloadwrapper(uint64_t iterations);
uint64_t loadwrapper(uint64_t iterations);
uint64_t vecstorewrapper(uint64_t iterations);
uint64_t mixloadstorewrapper(uint64_t iterations);
uint64_t mix21loadstorewrapper(uint64_t iterations);
uint64_t vecfma128wrapper(uint64_t iterations);
uint64_t scalarfmawrapper(uint64_t iterations);
uint64_t latscalarfmawrapper(uint64_t iterations);
uint64_t mixvecfaddfma128wrapper(uint64_t iterations);
uint64_t mixvecfmulfma128wrapper(uint64_t iterations);
uint64_t latvecfma128wrapper(uint64_t iteration);
uint64_t aesetestwrapper(uint64_t iterations);
uint64_t mixaesevecadd128wrapper(uint64_t iterations);
uint64_t pmullwrapper(uint64_t iterations);
uint64_t mixpmulladd128wrapper(uint64_t iterations);
/* threads = worker thread count (0 = run tests inline on the main thread);
   hardaffinity = nonzero when workers should re-pin to cpuset themselves */
int threads = 0, hardaffinity = 0;
cpu_set_t cpuset;
int main(int argc, char *argv[]) {
    /* Entry point: parses options, estimates the clock speed from a
     * dependent-add chain (clktest), then prints throughput (ops/clk) and
     * latency (clocks) figures for each asm microbenchmark kernel.
     * Options:
     *   -affinity <cpu>   pin this thread to one cpu
     *   -hardaffinity     pin to cpus 0 and 1; worker threads re-pin too
     *   -threads <n>      run each test on n threads
     *   -iter <n>         multiply iteration counts by n
     */
    struct timeval startTv, endTv;
    struct timezone startTz, endTz;
    uint64_t iterations = 1500000000;
    uint64_t iterationsHigh = iterations * 5;
    uint64_t time_diff_ms;
    float latency, clockSpeedGhz;
    /* Block-scope declaration: the asm exports mul64test alongside mul32test.
       Declared here in case the file-level prototypes omit it. */
    uint64_t mul64test(uint64_t);
    if (argc > 1) {
        for (int argIdx = 1; argIdx < argc; argIdx++) {
            if (*(argv[argIdx]) == '-') {
                char *arg = argv[argIdx] + 1;
                if (strncmp(arg, "affinity", 8) == 0) {
                    argIdx++;
                    if (argIdx >= argc) {   // option value missing
                        fprintf(stderr, "-affinity requires a cpu number\n");
                        return 1;
                    }
                    int targetCpu = atoi(argv[argIdx]);
                    CPU_ZERO(&cpuset);
                    CPU_SET(targetCpu, &cpuset);
                    sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
                    fprintf(stderr, "Set affinity to %d\n", targetCpu);
                }
                else if (strncmp(arg, "hardaffinity", 12) == 0) {
                    CPU_ZERO(&cpuset);
                    CPU_SET(0, &cpuset);
                    CPU_SET(1, &cpuset);
                    sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
                    /* message corrected to match the cpus actually set above */
                    fprintf(stderr, "Set affinity 0,1\n");
                    hardaffinity = 1;
                }
                else if (strncmp(arg, "threads", 7) == 0) {
                    argIdx++;
                    if (argIdx >= argc) {   // option value missing
                        fprintf(stderr, "-threads requires a thread count\n");
                        return 1;
                    }
                    threads = atoi(argv[argIdx]);
                    fprintf(stderr, "Multithreading mode, %d threads\n", threads);
                }
                else if (strncmp(arg, "iter", 4) == 0) {
                    argIdx++;
                    if (argIdx >= argc) {   // option value missing
                        fprintf(stderr, "-iter requires a multiplier\n");
                        return 1;
                    }
                    int iterMul = atoi(argv[argIdx]);
                    iterations *= iterMul;
                    iterationsHigh *= iterMul;
                    fprintf(stderr, "Scaled iterations by %d\n", iterMul);
                }
            }
        }
    }
    // figure out clock speed
    gettimeofday(&startTv, &startTz);
    clktest(iterations);
    gettimeofday(&endTv, &endTz);
    time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
    latency = 1e6 * (float)time_diff_ms / (float)iterations;
    // clk speed should be 1/latency, assuming we got one add per clk, roughly
    clockSpeedGhz = 1/latency;
    printf("Estimated clock speed> %.2f GHz\n", clockSpeedGhz);
    printf("Nops per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, noptest));
    printf("Adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addtest));
    printf("XORs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, eortest));
    printf("CMPs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, cmptest));
    printf("\n----Renamer Tests----\n");
    /* typo fix: "Indepdent" -> "Independent" */
    printf("Independent movs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, indepmovtest));
    printf("Dependent movs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, depmovtest));
    printf("eor -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, xorzerotest));
    printf("mov -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, movzerotest));
    printf("sub -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, subzerotest));
    printf("\n----ALU Pipe Layout Tests----\n");
    printf("Not taken jmps per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, jmptest));
    printf("Jump fusion test> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fusejmptest));
    printf("1:1 mixed not taken jmps / muls per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest));
    printf("1:2 mixed not taken jmps / muls per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest21));
    printf("1:1 mixed not taken jmps / adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmptest));
    printf("1:2 mixed not taken jmps / adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmp21test));
    printf("1:1 mixed add/mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmultest));
    printf("2:1 mixed add/mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmul21test));
    printf("ror per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, rortest));
    printf("1:1 mixed mul/ror per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulrortest));
    printf("1:3 madd:add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, maddaddtest));
    printf("32-bit mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul32test));
    /* bug fix: this previously re-ran mul32test, so the 64-bit figure was
       just a duplicate of the 32-bit one */
    printf("64-bit mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul64test));
    printf("64-bit multiply latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latmul64test));
    printf("\n----FP/ASIMD Crypto Tests----\n");
    printf("aese per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, aesetestwrapper));
    printf("1:1 aese and vec 128 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaesevecadd128wrapper));
    printf("pmull per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, pmullwrapper));
    printf("1:1 pmull and vec 128 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixpmulladd128wrapper));
    printf ("\n----FP/ASIMD Tests----\n");
    printf("scalar fp32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, faddwrapper));
    printf("128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecadd128wrapper));
    printf("128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecmul128wrapper));
    printf("128-bit vec int32 mixed multiply and add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddmul128wrapper));
    printf("128-bit vec fp32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfadd128wrapper));
    printf("128-bit vec fp32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfmul128wrapper));
    printf("128-bit vec fp32 mixed multiply and add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfmul128wrapper));
    printf("2:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddvecadd128wrapper));
    printf("3:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix3to1addvecadd128wrapper));
    printf("1:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix1to1addvecadd128wrapper));
    printf("1:1 mixed scalar 32-bit multiply and 128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulvecmulwrapper));
    printf("1:1 mixed 128-bit vec fp32 multiply and 128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecmulfmulwrapper));
    printf("1:1 mixed 128-bit vec fp32 add and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddfaddwrapper));
    printf("1:2 mixed not taken jumps and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecaddwrapper));
    printf("1:1 mixed not taken jumps and 128-bit vec int32 mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecmulwrapper));
    printf("128-bit vec int32 add latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecadd128wrapper));
    printf("128-bit vec int32 mul latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecmul128wrapper));
    /* consistency: latency (dependent-chain) tests use the base iteration
       count like the other latency measurements above and below */
    printf("Scalar FADD Latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latfaddwrapper));
    printf("128-bit vector FADD latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfadd128wrapper));
    printf("128-bit vector FMUL latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfmul128wrapper));
    printf("128-bit vector FMA per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfma128wrapper));
    printf("128-bit vector FMA latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfma128wrapper));
    printf("Scalar FMA per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, scalarfmawrapper));
    printf("Scalar FMA latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latscalarfmawrapper));
    printf("1:1 mixed 128-bit vector FMA/FADD per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfma128wrapper));
    printf("1:1 mixed 128-bit vector FMA/FMUL per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfmulfma128wrapper));
    printf("\n----Load/Store Tests----\n");
    printf("128-bit vec loads per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecloadwrapper));
    printf("128-bit vec stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecstorewrapper));
    printf("64-bit loads per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, loadwrapper));
    printf("1:1 mixed 64-bit loads/stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixloadstorewrapper));
    printf("2:1 mixed 64-bit loads/stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix21loadstorewrapper));
    return 0;
}
/* Per-thread work description handed to TestThread via pthread_create. */
struct TestThreadData {
uint64_t iterations;
uint64_t (*testfunc)(uint64_t);
};
/* pthread entry point: if hard affinity was requested, re-pin this worker to
 * the global cpuset, then run the configured kernel for the configured
 * iteration count. The kernel's return value is discarded. */
void *TestThread(void *param) {
struct TestThreadData *data = (struct TestThreadData *)param;
if (hardaffinity) sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
data->testfunc(data->iterations);
return NULL;
}
float measureFunction(uint64_t iterations, float clockSpeedGhz, uint64_t (*testfunc)(uint64_t)) {
    /* Times testfunc(iterations) with gettimeofday and converts elapsed time
     * into operations per clock, assuming one "operation" per iteration and
     * the previously estimated clock speed.
     * In multithreaded mode (global threads > 0) each worker runs the full
     * iteration count, but the result is still normalized by a single
     * thread's iterations — so the figure reflects aggregate throughput
     * relative to one stream's work. */
    struct timeval startTv, endTv;
    struct timezone startTz, endTz;
    uint64_t time_diff_ms;
    float latency, opsPerNs;
    gettimeofday(&startTv, &startTz);
    if (threads == 0) testfunc(iterations);
    else {
        pthread_t *testThreads = (pthread_t *)malloc(threads * sizeof(pthread_t));
        struct TestThreadData *testData = (struct TestThreadData *)malloc(threads * sizeof(struct TestThreadData));
        /* previously unchecked: a failed allocation would crash in the loop below */
        if (testThreads == NULL || testData == NULL) {
            fprintf(stderr, "Out of memory allocating thread data\n");
            exit(1);
        }
        for (int threadIdx = 0; threadIdx < threads; threadIdx++) {
            testData[threadIdx].iterations = iterations;
            testData[threadIdx].testfunc = testfunc;
            if (pthread_create(testThreads + threadIdx, NULL, TestThread, testData + threadIdx) != 0) {
                fprintf(stderr, "Could not create thread %d\n", threadIdx);
                exit(1);
            }
        }
        for (int threadIdx = 0; threadIdx < threads; threadIdx++) {
            pthread_join(testThreads[threadIdx], NULL);
        }
        free(testThreads);
        free(testData);
    }
    gettimeofday(&endTv, &endTz);
    time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
    latency = 1e6 * (float)time_diff_ms / (float)iterations;
    opsPerNs = 1/latency;
    //printf("%f adds/ns, %f adds/clk?\n", opsPerNs, opsPerNs / clockSpeedGhz);
    return opsPerNs / clockSpeedGhz;
}
/* Wrappers: the asm kernels take (iterations, data pointer); these bind the
   global test arrays so the kernels fit measureFunction's one-arg signature. */
uint64_t vecadd128wrapper(uint64_t iterations) {
return vecadd128test(iterations, intTestArr);
}
uint64_t vecmul128wrapper(uint64_t iterations) {
return vecmul128test(iterations, intTestArr);
}
/* lat* variants run dependent chains for latency rather than throughput */
uint64_t latvecadd128wrapper(uint64_t iterations) {
return latvecadd128test(iterations, intTestArr);
}
uint64_t latvecmul128wrapper(uint64_t iterations) {
return latvecmul128test(iterations, intTestArr);
}
uint64_t mixvecaddmul128wrapper(uint64_t iterations) {
return mixvecaddmul128test(iterations, intTestArr);
}
/* FP kernels read fpTestArr (declared earlier in this file, not visible in
   this chunk) — presumably a 64B-aligned float array; verify against the
   declaration above. */
uint64_t faddwrapper(uint64_t iterations) {
return faddtest(iterations, fpTestArr);
}
uint64_t latfaddwrapper(uint64_t iterations) {
return latfaddtest(iterations, fpTestArr);
}
uint64_t latvecfadd128wrapper(uint64_t iterations) {
return latvecfadd128test(iterations, fpTestArr);
}
uint64_t latvecfmul128wrapper(uint64_t iterations) {
return latvecfmul128test(iterations, fpTestArr);
}
uint64_t vecfadd128wrapper(uint64_t iterations) {
return vecfadd128test(iterations, fpTestArr);
}
uint64_t vecfmul128wrapper(uint64_t iterations) {
return vecfmul128test(iterations, fpTestArr);
}
uint64_t mixvecfaddfmul128wrapper(uint64_t iterations) {
return mixvecfaddfmul128test(iterations, fpTestArr);
}
/* mix* kernels interleave two instruction classes to probe shared vs.
   separate execution pipes; ratios are encoded in the kernel names. */
uint64_t mixaddvecadd128wrapper(uint64_t iterations) {
return mixaddvecadd128test(iterations, intTestArr);
}
uint64_t mix3to1addvecadd128wrapper(uint64_t iterations) {
return mix3to1addvecadd128test(iterations, intTestArr);
}
uint64_t mix1to1addvecadd128wrapper(uint64_t iterations) {
return mix1to1addvecadd128test(iterations, intTestArr);
}
uint64_t mixmulvecmulwrapper(uint64_t iterations) {
return mixmulvecmultest(iterations, intTestArr);
}
/* these two kernels take both arrays: (iterations, fp ptr, int ptr) */
uint64_t mixvecmulfmulwrapper(uint64_t iterations) {
return mixvecmulfmultest(iterations, fpTestArr, intTestArr);
}
uint64_t mixvecaddfaddwrapper(uint64_t iterations) {
return mixvecaddfaddtest(iterations, fpTestArr, intTestArr);
}
uint64_t mixjmpvecaddwrapper(uint64_t iterations) {
return mixjmpvecaddtest(iterations, intTestArr);
}
uint64_t mixjmpvecmulwrapper(uint64_t iterations) {
return mixjmpvecmultest(iterations, intTestArr);
}
/* Load/store kernels read from intTestArr and write to sinkArr. */
uint64_t vecloadwrapper(uint64_t iterations) {
return vecloadtest(iterations, intTestArr);
}
uint64_t vecstorewrapper(uint64_t iterations) {
return vecstoretest(iterations, intTestArr, sinkArr);
}
uint64_t loadwrapper(uint64_t iterations) {
/* sanity check: intTestArr is declared aligned(64), so this should never
   fire unless the toolchain ignored the attribute */
if (((uint64_t)intTestArr & 63) != 0) {
printf("Warning - load may not be 64B aligned\n");
}
return loadtest(iterations, intTestArr);
}
uint64_t mixloadstorewrapper(uint64_t iterations) {
return mixloadstoretest(iterations, intTestArr, sinkArr);
}
uint64_t mix21loadstorewrapper(uint64_t iterations) {
return mix21loadstoretest(iterations, intTestArr, sinkArr);
}
/* FMA kernels (vector fmla / scalar fmadd), throughput and latency variants. */
uint64_t vecfma128wrapper(uint64_t iterations) {
return vecfma128test(iterations, fpTestArr);
}
uint64_t scalarfmawrapper(uint64_t iterations) {
return scalarfmatest(iterations, fpTestArr);
}
uint64_t latscalarfmawrapper(uint64_t iterations) {
return latscalarfmatest(iterations, fpTestArr);
}
uint64_t latvecfma128wrapper(uint64_t iterations) {
return latvecfma128test(iterations, fpTestArr);
}
uint64_t mixvecfmulfma128wrapper(uint64_t iterations) {
return mixvecfmulfma128test(iterations, fpTestArr);
}
uint64_t mixvecfaddfma128wrapper(uint64_t iterations) {
return mixvecfaddfma128test(iterations, fpTestArr);
}
/* Crypto-extension kernels (aese / pmull), alone and mixed with vector adds. */
uint64_t aesetestwrapper(uint64_t iterations) {
return aesetest(iterations, intTestArr);
}
uint64_t mixaesevecadd128wrapper(uint64_t iterations) {
return mixaesevecadd128test(iterations, intTestArr);
}
uint64_t pmullwrapper(uint64_t iterations) {
return pmulltest(iterations, intTestArr);
}
uint64_t mixpmulladd128wrapper(uint64_t iterations) {
return mixpmulladd128test(iterations, intTestArr);
}
================================================
FILE: InstructionRate/arm_instructionrate.s
================================================
.text
.global clktest
.global addtest
.global eortest
.global maddaddtest
.global cmptest
.global addmultest
.global addmul21test
.global mixaddjmp21test
.global mul32test
.global mul64test
.global latmul64test
.global noptest
.global fusejmptest
.global jmptest
.global mixmuljmptest
.global mixmuljmptest21
.global mixaddjmptest
.global rortest
.global mixmulrortest
.global _clktest
.global _addtest
.global _eortest
.global _maddaddtest
.global _cmptest
.global _addmultest
.global _addmul21test
.global _mixaddjmp21test
.global _mul32test
.global _mul64test
.global _latmul64test
.global _noptest
.global _fusejmptest
.global _jmptest
.global _mixmuljmptest
.global _mixmuljmptest21
.global _mixaddjmptest
.global _rortest
.global _mixmulrortest
.global vecadd128test
.global latvecadd128test
.global vecmul128test
.global latvecmul128test
.global mixvecaddmul128test
.global faddtest
.global latfaddtest
.global latfmultest
.global latvecfadd128test
.global latvecfmul128test
.global vecfadd128test
.global vecfmul128test
.global mixvecfaddfmul128test
.global mixaddvecadd128test
.global mix3to1addvecadd128test
.global mix1to1addvecadd128test
.global mixmulvecmultest
.global mixvecmulfmultest
.global mixvecaddfaddtest
.global mixjmpvecaddtest
.global mixjmpvecmultest
.global vecfma128test
.global latvecfma128test
.global scalarfmatest
.global latscalarfmatest
.global aesetest
.global mixaesevecadd128test
.global pmulltest
.global mixpmulladd128test
.global _vecadd128test
.global _latvecadd128test
.global _vecmul128test
.global _latvecmul128test
.global _mixvecaddmul128test
.global _faddtest
.global _latfaddtest
.global _latfmultest
.global _latvecfadd128test
.global _latvecfmul128test
.global _vecfadd128test
.global _vecfmul128test
.global _mixvecfaddfmul128test
.global _mixaddvecadd128test
.global _mix3to1addvecadd128test
.global _mix1to1addvecadd128test
.global _mixmulvecmultest
.global _mixvecmulfmultest
.global _mixvecaddfaddtest
.global _mixjmpvecaddtest
.global _mixjmpvecmultest
.global _vecfma128test
.global _latvecfma128test
.global _scalarfmatest
.global _latscalarfmatest
.global mixvecfaddfma128test
.global mixvecfmulfma128test
.global loadtest
.global mixloadstoretest
.global mix21loadstoretest
.global vecloadtest
.global vecstoretest
.global _mixvecfaddfma128test
.global _mixvecfmulfma128test
.global _loadtest
.global _mixloadstoretest
.global _mix21loadstoretest
.global _vecloadtest
.global _vecstoretest
//renamer tests
.global indepmovtest
.global depmovtest
.global xorzerotest
.global movzerotest
.global subzerotest
.global _indepmovtest
.global _depmovtest
.global _xorzerotest
.global _movzerotest
.global _subzerotest
.global _aesetest
.global _mixaesevecadd128test
.global _pmulltest
.global _mixpmulladd128test
.balign 4
/* x0 = arg = iteration count. all iteration counts must be divisible by 10 */
/* clktest: dependent add chain (each add waits on the previous one), so on
   typical cores this retires ~1 add/clk and elapsed time estimates the clock.
   20 adds per loop pass; x14 = 20 is the per-pass counter decrement.
   NOTE(review): x9-x15 are caller-saved under AAPCS64, so the stack
   save/restore is not strictly required — kept as written. */
_clktest:
clktest:
sub sp, sp, #0x30
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
mov x15, 1
mov x14, 20
eor x13, x13, x13
clktest_loop:
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
sub x0, x0, x14
cbnz x0, clktest_loop
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x30
ret
/* noptest: 30 nops per pass (x14 = 30) to measure front-end/retire width.
   The x15/x13 setup mirrors the other kernels but is unused here. */
_noptest:
noptest:
sub sp, sp, #0x30
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
mov x15, 1
mov x14, 30
eor x13, x13, x13
noptest_loop:
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
sub x0, x0, x14
cbnz x0, noptest_loop
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x30
ret
/* addtest: add throughput. 30 adds per pass spread over 5 independent
   accumulators (x9-x13), so the limit is pipe count, not latency. */
_addtest:
addtest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 30
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
eor x10, x10, x10
eor x9, x9, x9
addtest_loop:
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add x9, x9, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add x9, x9, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add x9, x9, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add x9, x9, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add x9, x9, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add x9, x9, x15
sub x0, x0, x14
cbnz x0, addtest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* maddaddtest: 1 madd per 3 adds (20 instructions per pass, x14 = 20).
   The madd reads x0 (the live loop counter) as a multiplicand, which keeps
   it from being hoisted or simplified. */
_maddaddtest:
maddaddtest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 20
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
mov x10, 2
eor x9, x9, x9
mov x8, 3
maddaddtest_loop:
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
madd x10, x8, x0, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
madd x10, x8, x0, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
madd x10, x8, x0, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
madd x10, x8, x0, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
madd x10, x8, x0, x15
sub x0, x0, x14
cbnz x0, maddaddtest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* eortest: xor throughput, same 5-accumulator / 30-per-pass structure as
   addtest. */
_eortest:
eortest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 30
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
eor x10, x10, x10
eor x9, x9, x9
eortest_loop:
eor x13, x13, x15
eor x12, x12, x15
eor x11, x11, x15
eor x10, x10, x15
eor x9, x9, x15
eor x13, x13, x15
eor x12, x12, x15
eor x11, x11, x15
eor x10, x10, x15
eor x9, x9, x15
eor x13, x13, x15
eor x12, x12, x15
eor x11, x11, x15
eor x10, x10, x15
eor x9, x9, x15
eor x13, x13, x15
eor x12, x12, x15
eor x11, x11, x15
eor x10, x10, x15
eor x9, x9, x15
eor x13, x13, x15
eor x12, x12, x15
eor x11, x11, x15
eor x10, x10, x15
eor x9, x9, x15
eor x13, x13, x15
eor x12, x12, x15
eor x11, x11, x15
eor x10, x10, x15
eor x9, x9, x15
sub x0, x0, x14
cbnz x0, eortest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* cmptest: compare throughput, 30 cmps per pass. Every cmp writes the same
   flags register, so this also probes flag renaming. */
_cmptest:
cmptest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 30
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
eor x10, x10, x10
eor x9, x9, x9
cmptest_loop:
cmp x13, x13
cmp x12, x12
cmp x11, x11
cmp x10, x10
cmp x9, x9
cmp x13, x13
cmp x12, x12
cmp x11, x11
cmp x10, x10
cmp x9, x9
cmp x13, x13
cmp x12, x12
cmp x11, x11
cmp x10, x10
cmp x9, x9
cmp x13, x13
cmp x12, x12
cmp x11, x11
cmp x10, x10
cmp x9, x9
cmp x13, x13
cmp x12, x12
cmp x11, x11
cmp x10, x10
cmp x9, x9
cmp x13, x13
cmp x12, x12
cmp x11, x11
cmp x10, x10
cmp x9, x9
sub x0, x0, x14
cbnz x0, cmptest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* addmultest: 1:1 interleave of 32-bit muls and 64-bit adds, 20 instructions
   per pass, to see whether adds can issue alongside multiplies. */
_addmultest:
addmultest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 2
mov x14, 20
mov x13, 2
eor x12, x12, x12
mov x11, 2
eor x10, x10, x10
mov x9, 2
mov x8, 2
addmultest_loop:
mul w13, w13, w15
add x12, x12, x15
mul w11, w11, w15
add x10, x10, x15
mul w9, w9, w15
add x12, x12, x15
mul w8, w8, w15
add x10, x10, x15
mul w13, w13, w15
add x12, x12, x15
mul w11, w11, w15
add x10, x10, x15
mul w9, w9, w15
add x12, x12, x15
mul w8, w8, w15
add x10, x10, x15
mul w13, w13, w15
add x12, x12, x15
mul w11, w11, w15
add x10, x10, x15
sub x0, x0, x14
cbnz x0, addmultest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* addmul21test: 2 adds per mul, 24 instructions per pass. Since 24 does not
   evenly divide the iteration counts, the loop exits on a signed compare
   (b.gt) instead of cbnz so x0 going negative still terminates. */
_addmul21test:
addmul21test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 2
mov x14, 24
mov x13, 2
eor x12, x12, x12
mov x11, 2
eor x10, x10, x10
mov x9, 2
mov x8, 2
addmul21test_loop:
mul w13, w13, w15
add x12, x12, x15
add x10, x10, x15
mul w11, w11, w15
add x12, x12, x15
add x10, x10, x15
mul w9, w9, w15
add x12, x12, x15
add x10, x10, x15
mul w8, w8, w15
add x12, x12, x15
add x10, x10, x15
mul w13, w13, w15
add x12, x12, x15
add x10, x10, x15
mul w11, w11, w15
add x12, x12, x15
add x10, x10, x15
mul w9, w9, w15
add x12, x12, x15
add x10, x10, x15
mul w8, w8, w15
add x12, x12, x15
add x10, x10, x15
sub x0, x0, x14
cmp x0, 0
b.gt addmul21test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* mul32test: 32-bit multiply throughput, 20 muls per pass over 6 independent
   registers (w8-w13). */
_mul32test:
mul32test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 2
mov x14, 20
mov x13, x15
mov x12, x15
mov x11, x15
mov x10, x15
mov x9, x15
mov x8, x15
mul32test_loop:
mul w13, w13, w15
mul w12, w12, w15
mul w11, w11, w15
mul w10, w10, w15
mul w9, w9, w15
mul w8, w8, w15
mul w13, w13, w15
mul w12, w12, w15
mul w11, w11, w15
mul w10, w10, w15
mul w9, w9, w15
mul w8, w8, w15
mul w13, w13, w15
mul w12, w12, w15
mul w11, w11, w15
mul w10, w10, w15
mul w9, w9, w15
mul w8, w8, w15
mul w13, w13, w15
mul w12, w12, w15
sub x0, x0, x14
cbnz x0, mul32test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* mul64test: 64-bit multiply throughput, same structure as mul32test with
   x-register forms. */
_mul64test:
mul64test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 2
mov x14, 20
mov x13, x15
mov x12, x15
mov x11, x15
mov x10, x15
mov x9, x15
mov x8, x15
mul64test_loop:
mul x13, x13, x15
mul x12, x12, x15
mul x11, x11, x15
mul x10, x10, x15
mul x9, x9, x15
mul x8, x8, x15
mul x13, x13, x15
mul x12, x12, x15
mul x11, x11, x15
mul x10, x10, x15
mul x9, x9, x15
mul x8, x8, x15
mul x13, x13, x15
mul x12, x12, x15
mul x11, x11, x15
mul x10, x10, x15
mul x9, x9, x15
mul x8, x8, x15
mul x13, x13, x15
mul x12, x12, x15
sub x0, x0, x14
cbnz x0, mul64test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* latmul64test: 64-bit multiply latency via a fully dependent chain
   (x13 = x13 * x13). NOTE(review): repeated squaring of 2 wraps to 0 mod
   2^64 after a few iterations, so most of the chain multiplies zeros —
   assumed not to change timing; confirm on early-out multipliers. */
_latmul64test:
latmul64test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 2
mov x14, 20
mov x13, x15
latmul64test_loop:
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
sub x0, x0, x14
cbnz x0, latmul64test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* needs an additional parameter passed in x1 - ptr to a 16B array of
   elements to load (the C wrappers pass intTestArr for the int kernels and
   fpTestArr for the fp kernels) */
/* vecadd128test: 128-bit vector int32 add throughput; 20 adds per pass over
   6 independent vector registers (v16-v21). */
_vecadd128test:
vecadd128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
vecadd128test_loop:
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
add v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
add v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
add v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
add v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
add v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
add v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
sub x0, x0, x14
cbnz x0, vecadd128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* latvecadd128test: vector int32 add latency via a dependent chain on v16. */
_latvecadd128test:
latvecadd128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
latvecadd128test_loop:
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
sub x0, x0, x14
cbnz x0, latvecadd128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* faddtest: scalar fp32 add throughput; 20 fadds per pass over 6 independent
   registers, initialized from four distinct floats at [x1..x1+0xC]. */
_faddtest:
faddtest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr s16, [x1]
ldr s17, [x1, #0x4]
ldr s18, [x1, #0x8]
ldr s19, [x1, #0xC]
ldr s20, [x1]
ldr s21, [x1, #0x4]
faddtest_loop:
fadd s16, s16, s16
fadd s17, s17, s17
fadd s18, s18, s18
fadd s19, s19, s19
fadd s20, s20, s20
fadd s21, s21, s21
fadd s16, s16, s16
fadd s17, s17, s17
fadd s18, s18, s18
fadd s19, s19, s19
fadd s20, s20, s20
fadd s21, s21, s21
fadd s16, s16, s16
fadd s17, s17, s17
fadd s18, s18, s18
fadd s19, s19, s19
fadd s20, s20, s20
fadd s21, s21, s21
fadd s16, s16, s16
fadd s17, s17, s17
sub x0, x0, x14
cbnz x0, faddtest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* latfaddtest: scalar fp32 add latency via a dependent chain on s16. */
_latfaddtest:
latfaddtest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr s16, [x1]
latfaddtest_loop:
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
sub x0, x0, x14
cbnz x0, latfaddtest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* latfmultest: scalar fp32 multiply latency via a dependent chain on s16. */
_latfmultest:
latfmultest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr s16, [x1]
latfmultest_loop:
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
sub x0, x0, x14
cbnz x0, latfmultest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* latvecmul128test: 128-bit vector int32 multiply latency, dependent chain
   on v16. */
_latvecmul128test:
latvecmul128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
latvecmul128test_loop:
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
sub x0, x0, x14
cbnz x0, latvecmul128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* vecmul128test: 128-bit vector int32 multiply throughput; 20 muls per pass
   over 6 independent vector registers. */
_vecmul128test:
vecmul128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
vecmul128test_loop:
mul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
mul v18.4s, v18.4s, v18.4s
mul v19.4s, v19.4s, v19.4s
mul v20.4s, v20.4s, v20.4s
mul v21.4s, v21.4s, v21.4s
mul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
mul v18.4s, v18.4s, v18.4s
mul v19.4s, v19.4s, v19.4s
mul v20.4s, v20.4s, v20.4s
mul v21.4s, v21.4s, v21.4s
mul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
mul v18.4s, v18.4s, v18.4s
mul v19.4s, v19.4s, v19.4s
mul v20.4s, v20.4s, v20.4s
mul v21.4s, v21.4s, v21.4s
mul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
sub x0, x0, x14
cbnz x0, vecmul128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* scalarfmatest: scalar fp32 fused multiply-add throughput; 20 fmadds per
   pass over 10 independent registers (s16-s25). Loads use the full 128-bit
   q-register form; only the low 32 bits are consumed by fmadd. */
_scalarfmatest:
scalarfmatest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
ldr q24, [x1]
ldr q25, [x1]
scalarfmatest_loop:
fmadd s16, s16, s16, s16
fmadd s17, s17, s17, s17
fmadd s18, s18, s18, s18
fmadd s19, s19, s19, s19
fmadd s20, s20, s20, s20
fmadd s21, s21, s21, s21
fmadd s22, s22, s22, s22
fmadd s23, s23, s23, s23
fmadd s24, s24, s24, s24
fmadd s25, s25, s25, s25
fmadd s16, s16, s16, s16
fmadd s17, s17, s17, s17
fmadd s18, s18, s18, s18
fmadd s19, s19, s19, s19
fmadd s20, s20, s20, s20
fmadd s21, s21, s21, s21
fmadd s22, s22, s22, s22
fmadd s23, s23, s23, s23
fmadd s24, s24, s24, s24
fmadd s25, s25, s25, s25
sub x0, x0, x14
cbnz x0, scalarfmatest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* latscalarfmatest: scalar FMA latency via a dependent chain on s16. */
_latscalarfmatest:
latscalarfmatest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
latscalarfmatest_loop:
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
sub x0, x0, x14
cbnz x0, latscalarfmatest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* vecfma128test: 128-bit vector FMA (fmla accumulates: Vd += Vn * Vm)
   throughput; 20 fmlas per pass over v16-v25. */
_vecfma128test:
vecfma128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
ldr q24, [x1]
ldr q25, [x1]
vecfma128test_loop:
fmla v16.4s, v16.4s, v16.4s
fmla v17.4s, v17.4s, v17.4s
fmla v18.4s, v18.4s, v18.4s
fmla v19.4s, v19.4s, v19.4s
fmla v20.4s, v20.4s, v20.4s
fmla v21.4s, v21.4s, v21.4s
fmla v22.4s, v22.4s, v22.4s
fmla v23.4s, v23.4s, v23.4s
fmla v24.4s, v24.4s, v24.4s
fmla v25.4s, v25.4s, v25.4s
fmla v16.4s, v16.4s, v16.4s
fmla v17.4s, v17.4s, v17.4s
fmla v18.4s, v18.4s, v18.4s
fmla v19.4s, v19.4s, v19.4s
fmla v20.4s, v20.4s, v20.4s
fmla v21.4s, v21.4s, v21.4s
fmla v22.4s, v22.4s, v22.4s
fmla v23.4s, v23.4s, v23.4s
fmla v24.4s, v24.4s, v24.4s
fmla v25.4s, v25.4s, v25.4s
sub x0, x0, x14
cbnz x0, vecfma128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixvecfmulfma128test(x0 = iteration count, x1 = FP test data pointer)
// 1:1 mix of 128-bit fmla and fmul (10 of each per iteration) to see
// whether FMA and FP multiply issue on the same or separate pipes.
_mixvecfmulfma128test:
mixvecfmulfma128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
ldr q24, [x1]
ldr q25, [x1]
mixvecfmulfma128test_loop:
fmla v16.4s, v16.4s, v16.4s
fmul v17.4s, v17.4s, v17.4s
fmla v18.4s, v18.4s, v18.4s
fmul v19.4s, v19.4s, v19.4s
fmla v20.4s, v20.4s, v20.4s
fmul v21.4s, v21.4s, v21.4s
fmla v22.4s, v22.4s, v22.4s
fmul v23.4s, v23.4s, v23.4s
fmla v24.4s, v24.4s, v24.4s
fmul v25.4s, v25.4s, v25.4s
fmla v16.4s, v16.4s, v16.4s
fmul v17.4s, v17.4s, v17.4s
fmla v18.4s, v18.4s, v18.4s
fmul v19.4s, v19.4s, v19.4s
fmla v20.4s, v20.4s, v20.4s
fmul v21.4s, v21.4s, v21.4s
fmla v22.4s, v22.4s, v22.4s
fmul v23.4s, v23.4s, v23.4s
fmla v24.4s, v24.4s, v24.4s
fmul v25.4s, v25.4s, v25.4s
sub x0, x0, x14
cbnz x0, mixvecfmulfma128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixvecfaddfma128test(x0 = iteration count, x1 = FP test data pointer)
// 1:1 mix of 128-bit fmla and fadd (10 of each per iteration) to see
// whether FMA and FP add can issue in parallel.
_mixvecfaddfma128test:
mixvecfaddfma128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
ldr q24, [x1]
ldr q25, [x1]
mixvecfaddfma128test_loop:
fmla v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fmla v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fmla v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fmla v22.4s, v22.4s, v22.4s
fadd v23.4s, v23.4s, v23.4s
fmla v24.4s, v24.4s, v24.4s
fadd v25.4s, v25.4s, v25.4s
fmla v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fmla v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fmla v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fmla v22.4s, v22.4s, v22.4s
fadd v23.4s, v23.4s, v23.4s
fmla v24.4s, v24.4s, v24.4s
fadd v25.4s, v25.4s, v25.4s
sub x0, x0, x14
cbnz x0, mixvecfaddfma128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// latvecfma128test(x0 = iteration count, x1 = FP test data pointer)
// 128-bit vector FMA latency: 20 fmla all on v16, one serial chain.
_latvecfma128test:
latvecfma128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
latvecfma128test_loop:
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
sub x0, x0, x14
cbnz x0, latvecfma128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// vecfadd128test(x0 = iteration count, x1 = FP test data pointer)
// 128-bit FP add throughput: 20 independent fadd per iteration over
// v16-v21 (6-register rotation, so each chain has 5+ ops of slack).
_vecfadd128test:
vecfadd128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
vecfadd128test_loop:
fadd v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fadd v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fadd v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fadd v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fadd v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fadd v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fadd v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fadd v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fadd v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fadd v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
sub x0, x0, x14
cbnz x0, vecfadd128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// latvecfadd128test(x0 = iteration count, x1 = FP test data pointer)
// 128-bit FP add latency: 20 fadd all on v16, one serial chain.
_latvecfadd128test:
latvecfadd128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
latvecfadd128test_loop:
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
sub x0, x0, x14
cbnz x0, latvecfadd128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// vecfmul128test(x0 = iteration count, x1 = FP test data pointer)
// 128-bit FP multiply throughput: 20 independent fmul per iteration
// over a 6-register rotation (v16-v21).
_vecfmul128test:
vecfmul128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
vecfmul128test_loop:
fmul v16.4s, v16.4s, v16.4s
fmul v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
fmul v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
fmul v21.4s, v21.4s, v21.4s
fmul v16.4s, v16.4s, v16.4s
fmul v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
fmul v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
fmul v21.4s, v21.4s, v21.4s
fmul v16.4s, v16.4s, v16.4s
fmul v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
fmul v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
fmul v21.4s, v21.4s, v21.4s
fmul v16.4s, v16.4s, v16.4s
fmul v17.4s, v17.4s, v17.4s
sub x0, x0, x14
cbnz x0, vecfmul128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// latvecfmul128test(x0 = iteration count, x1 = FP test data pointer)
// 128-bit FP multiply latency: 20 fmul all on v16, one serial chain.
_latvecfmul128test:
latvecfmul128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
latvecfmul128test_loop:
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
sub x0, x0, x14
cbnz x0, latvecfmul128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixvecfaddfmul128test(x0 = iteration count, x1 = FP test data pointer)
// 1:1 mix of 128-bit fmul and fadd (10 of each per iteration) to check
// whether FP add and FP multiply issue on separate pipes.
_mixvecfaddfmul128test:
mixvecfaddfmul128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
mixvecfaddfmul128test_loop:
fmul v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fmul v22.4s, v22.4s, v22.4s
fadd v23.4s, v23.4s, v23.4s
fmul v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fmul v22.4s, v22.4s, v22.4s
fadd v23.4s, v23.4s, v23.4s
fmul v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
sub x0, x0, x14
cbnz x0, mixvecfaddfmul128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixvecaddmul128test(x0 = iteration count, x1 = test data pointer)
// 1:1 mix of 128-bit integer mul and add (10 of each per iteration) to
// check whether vector integer add and multiply share a pipe.
_mixvecaddmul128test:
mixvecaddmul128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
mixvecaddmul128test_loop:
mul v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
mul v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
mul v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
mul v22.4s, v22.4s, v22.4s
add v23.4s, v23.4s, v23.4s
mul v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
mul v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
mul v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
mul v22.4s, v22.4s, v22.4s
add v23.4s, v23.4s, v23.4s
mul v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
mul v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
sub x0, x0, x14
cbnz x0, mixvecaddmul128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixaddvecadd128test(x0 = iteration count, x1 = test data pointer)
// 2:1 mix of scalar integer adds and 128-bit vector adds (20 scalar +
// 10 vector = 30 instructions per iteration; x14 = 30 matches).
// Measures combined issue width across the integer and SIMD domains.
_mixaddvecadd128test:
mixaddvecadd128test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 30
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
eor x10, x10, x10
eor x9, x9, x9
eor x8, x8, x8
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
mixaddvecadd128test_loop:
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
// NOTE(review): this group reuses v20/v21 while v22/v23 were loaded and
// look intended here (copy-paste slip?). It shortens the dependency
// slack on v20/v21 slightly; confirm against other ports of this test.
add v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add v22.4s, v22.4s, v22.4s
add v23.4s, v23.4s, v23.4s
sub x0, x0, x14
cbnz x0, mixaddvecadd128test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mix3to1addvecadd128test(x0 = iteration count, x1 = test data pointer)
// 3:1 mix of scalar integer adds to 128-bit vector adds (30 scalar +
// 10 vector = 40 instructions per iteration; x14 = 40 matches).
_mix3to1addvecadd128test:
mix3to1addvecadd128test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 40
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
eor x10, x10, x10
eor x9, x9, x9
eor x8, x8, x8
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
mix3to1addvecadd128test_loop:
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v16.4s, v16.4s, v16.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v17.4s, v17.4s, v17.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v20.4s, v20.4s, v20.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v16.4s, v16.4s, v16.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v17.4s, v17.4s, v17.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v20.4s, v20.4s, v20.4s
sub x0, x0, x14
cbnz x0, mix3to1addvecadd128test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mix1to1addvecadd128test(x0 = iteration count, x1 = test data pointer)
// 1:1 mix of scalar integer adds and 128-bit vector adds (20 + 20 = 40
// instructions per iteration; x14 = 40 matches).
_mix1to1addvecadd128test:
mix1to1addvecadd128test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 40
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
eor x10, x10, x10
eor x9, x9, x9
eor x8, x8, x8
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
mix1to1addvecadd128test_loop:
add x13, x13, x15
add v16.4s, v16.4s, v16.4s
add x12, x12, x15
add v17.4s, v17.4s, v17.4s
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x10, x10, x15
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add v16.4s, v16.4s, v16.4s
add x12, x12, x15
add v17.4s, v17.4s, v17.4s
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x10, x10, x15
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add v16.4s, v16.4s, v16.4s
add x12, x12, x15
add v17.4s, v17.4s, v17.4s
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x10, x10, x15
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add v16.4s, v16.4s, v16.4s
add x12, x12, x15
add v17.4s, v17.4s, v17.4s
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x10, x10, x15
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add v16.4s, v16.4s, v16.4s
add x12, x12, x15
add v17.4s, v17.4s, v17.4s
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x10, x10, x15
add v19.4s, v19.4s, v19.4s
sub x0, x0, x14
cbnz x0, mix1to1addvecadd128test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixmulvecmultest(x0 = iteration count, x1 = test data pointer)
// 1:1 mix of scalar 32-bit integer multiplies and 128-bit vector integer
// multiplies (10 + 10 = 20 instructions per iteration; x14 = 20 matches).
// Fixes vs. original: removed a duplicated "stp x14, x15, [sp, #0x10]"
// (the same pair was stored twice to the same slot) and a stray
// "mov x7, x15" that clobbered argument register x7 and was never used.
_mixmulvecmultest:
mixmulvecmultest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 2
mov x14, 20
mov x13, x15
mov x12, x15
mov x11, x15
mov x10, x15
mov x9, x15
mov x8, x15
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
mixmulvecmultest_loop:
mul w8, w8, w15
mul v16.4s, v16.4s, v16.4s
mul w9, w9, w15
mul v17.4s, v17.4s, v17.4s
mul w10, w10, w15
mul v18.4s, v18.4s, v18.4s
mul w11, w11, w15
mul v19.4s, v19.4s, v19.4s
mul w12, w12, w15
mul v20.4s, v20.4s, v20.4s
mul w8, w8, w15
mul v16.4s, v16.4s, v16.4s
mul w9, w9, w15
mul v17.4s, v17.4s, v17.4s
mul w10, w10, w15
mul v18.4s, v18.4s, v18.4s
mul w11, w11, w15
mul v19.4s, v19.4s, v19.4s
mul w12, w12, w15
mul v20.4s, v20.4s, v20.4s
sub x0, x0, x14
cbnz x0, mixmulvecmultest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixvecmulfmultest(x0 = iteration count, x1 = FP data, x2 = int data)
// 1:1 mix of 128-bit FP multiplies and 128-bit integer multiplies
// (10 + 10 = 20 per iteration) to see whether fmul and mul share a pipe.
_mixvecmulfmultest:
mixvecmulfmultest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x2]
ldr q18, [x1]
ldr q19, [x2]
ldr q20, [x1]
ldr q21, [x2]
mixvecmulfmultest_loop:
fmul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
mul v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
mul v21.4s, v21.4s, v21.4s
fmul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
mul v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
mul v21.4s, v21.4s, v21.4s
fmul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
mul v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
mul v21.4s, v21.4s, v21.4s
fmul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
sub x0, x0, x14
cbnz x0, mixvecmulfmultest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixvecaddfaddtest(x0 = iteration count, x1 = FP data, x2 = int data)
// 1:1 mix of 128-bit FP adds and 128-bit integer adds (10 + 10 = 20
// per iteration) to see whether fadd and add issue on separate pipes.
_mixvecaddfaddtest:
mixvecaddfaddtest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x2]
ldr q18, [x1]
ldr q19, [x2]
ldr q20, [x1]
ldr q21, [x2]
mixvecaddfaddtest_loop:
fadd v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
fadd v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
fadd v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
fadd v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
fadd v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
fadd v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
fadd v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
fadd v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
fadd v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
fadd v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
sub x0, x0, x14
cbnz x0, mixvecaddfaddtest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixjmpvecaddtest(x0 = iteration count, x1 = test data pointer)
// 2:1 mix of 128-bit vector adds and not-taken branches (20 adds +
// 10 cbz = 30 instructions per iteration; x14 = 30 matches).
// The cbz tests are never taken: x0 stays nonzero inside the loop,
// so the "jellydonut" exit only matters for branch-predictor pressure.
_mixjmpvecaddtest:
mixjmpvecaddtest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 30
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
mixjmpvecaddtest_loop:
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v20.4s, v20.4s, v20.4s
add v16.4s, v16.4s, v16.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v17.4s, v17.4s, v17.4s
add v18.4s, v18.4s, v18.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v19.4s, v19.4s, v19.4s
add v20.4s, v20.4s, v20.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v20.4s, v20.4s, v20.4s
add v16.4s, v16.4s, v16.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v17.4s, v17.4s, v17.4s
add v18.4s, v18.4s, v18.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v19.4s, v19.4s, v19.4s
add v20.4s, v20.4s, v20.4s
cbz x0, mixjmpvecaddtest_jellydonut
sub x0, x0, x14
cbnz x0, mixjmpvecaddtest_loop
mixjmpvecaddtest_jellydonut:
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixjmpvecmultest(x0 = iteration count, x1 = test data pointer)
// 1:1 mix of 128-bit vector multiplies and not-taken branches
// (10 mul + 10 cbz = 20 per iteration). The cbz are never taken:
// x0 is nonzero throughout the loop body.
_mixjmpvecmultest:
mixjmpvecmultest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
mixjmpvecmultest_loop:
mul v16.4s, v16.4s, v16.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v17.4s, v17.4s, v17.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v18.4s, v18.4s, v18.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v19.4s, v19.4s, v19.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v20.4s, v20.4s, v20.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v16.4s, v16.4s, v16.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v17.4s, v17.4s, v17.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v18.4s, v18.4s, v18.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v19.4s, v19.4s, v19.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v20.4s, v20.4s, v20.4s
cbz x0, mixjmpvecmultest_jellydonut
sub x0, x0, x14
cbnz x0, mixjmpvecmultest_loop
mixjmpvecmultest_jellydonut:
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// vecloadtest(x0 = iteration count, x1 = source pointer)
// 128-bit load throughput: 20 ldr q from the same address per iteration
// (all hit the same cache line, so only load-port width is measured).
_vecloadtest:
vecloadtest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
vecloadtest_loop:
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
sub x0, x0, x14
cbnz x0, vecloadtest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// vecstoretest(x0 = iteration count, x1 = source pointer, x2 = sink pointer)
// 128-bit store throughput: 20 str q to the same address per iteration.
_vecstoretest:
vecstoretest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
vecstoretest_loop:
str q16, [x2]
str q17, [x2]
str q18, [x2]
str q19, [x2]
str q20, [x2]
str q16, [x2]
str q17, [x2]
str q18, [x2]
str q19, [x2]
str q20, [x2]
str q16, [x2]
str q17, [x2]
str q18, [x2]
str q19, [x2]
str q20, [x2]
str q16, [x2]
str q17, [x2]
str q18, [x2]
str q19, [x2]
str q20, [x2]
sub x0, x0, x14
cbnz x0, vecstoretest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// loadtest(x0 = iteration count, x1 = source pointer)
// Scalar 64-bit load throughput: 20 ldr x from the same address per
// iteration. x14 is the destination register deliberately skipped in
// the loop body because it holds the decrement.
_loadtest:
loadtest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x14, 20
loadtest_loop:
ldr x10, [x1]
ldr x11, [x1]
ldr x12, [x1]
ldr x13, [x1]
ldr x15, [x1]
ldr x10, [x1]
ldr x11, [x1]
ldr x12, [x1]
ldr x13, [x1]
ldr x15, [x1]
ldr x10, [x1]
ldr x11, [x1]
ldr x12, [x1]
ldr x13, [x1]
ldr x15, [x1]
ldr x10, [x1]
ldr x11, [x1]
ldr x12, [x1]
ldr x13, [x1]
ldr x15, [x1]
sub x0, x0, x14
cbnz x0, loadtest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixloadstoretest(x0 = iteration count, x1 = load ptr, x2 = store ptr)
// 1:1 load/store mix: 10 scalar loads + 10 scalar stores per iteration.
// Load and store addresses differ, so no store-to-load forwarding.
_mixloadstoretest:
mixloadstoretest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x14, 20
mixloadstoretest_loop:
ldr x10, [x1]
str x14, [x2]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
str x14, [x2]
ldr x13, [x1]
str x14, [x2]
ldr x15, [x1]
str x14, [x2]
ldr x10, [x1]
str x14, [x2]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
str x14, [x2]
ldr x13, [x1]
str x14, [x2]
ldr x15, [x1]
str x14, [x2]
sub x0, x0, x14
cbnz x0, mixloadstoretest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mix21loadstoretest(x0 = iteration count, x1 = load ptr, x2 = store ptr)
// 2:1 load/store mix: 20 loads + 10 stores = 30 instructions per
// iteration (x14 = 30 matches), matching typical AGU port ratios.
_mix21loadstoretest:
mix21loadstoretest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x14, 30
mix21loadstoretest_loop:
ldr x10, [x1]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
ldr x13, [x1]
str x14, [x2]
ldr x10, [x1]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
ldr x13, [x1]
str x14, [x2]
ldr x10, [x1]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
ldr x13, [x1]
str x14, [x2]
ldr x10, [x1]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
ldr x13, [x1]
str x14, [x2]
ldr x10, [x1]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
ldr x13, [x1]
str x14, [x2]
sub x0, x0, x14
cbnz x0, mix21loadstoretest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// jmptest(x0 = iteration count)
// Not-taken branch throughput: 20 cbz per iteration, none ever taken
// (x0 is nonzero inside the loop). Measures branch issue rate.
_jmptest:
jmptest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
jmptest_loop:
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
sub x0, x0, x14
cbnz x0, jmptest_loop
jmptest_jellydonut:
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// fusejmptest(x0 = iteration count)
// cmp + b.eq fusion test: nop-padded cmp/branch pairs, none taken
// (x0 is nonzero inside the loop body).
// Fix vs. original: the four b.eq targets read "jmptest_jellydonut",
// the exit label of a DIFFERENT function; they now target this
// function's own fusejmptest_jellydonut. (The branches are never taken,
// but jmptest's epilogue would have run had one fired, and the typo
// breaks the build if jmptest is assembled out or renamed.)
_fusejmptest:
fusejmptest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
fusejmptest_loop:
nop
nop
cmp x0, 0
b.eq fusejmptest_jellydonut
nop
nop
cmp x0, 0
b.eq fusejmptest_jellydonut
nop
nop
cmp x0, 0
b.eq fusejmptest_jellydonut
nop
nop
cmp x0, 0
b.eq fusejmptest_jellydonut
nop
sub x0, x0, x14
cmp x0, 0
b.ne fusejmptest_loop
fusejmptest_jellydonut:
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixmuljmptest(x0 = iteration count)
// 1:1 mix of scalar multiplies and not-taken branches (10 mul +
// 10 cbz = 20 per iteration). cbz never fires: x0 is nonzero in-loop.
_mixmuljmptest:
mixmuljmptest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x8, 7
mov x9, 6
mov x10, 1
mov x11, 2
mov x12, 3
mov x13, 4
mov x15, 5
mov x14, 20
mixmuljmptest_loop:
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
mul x9, x9, x15
mul x8, x8, x15
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
sub x0, x0, x14
cbnz x0, mixmuljmptest_loop
mixmuljmptest_jellydonut:
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixmuljmptest21(x0 = iteration count)
// 2:1 mix of scalar multiplies and not-taken branches (20 mul +
// 10 cbz = 30 per iteration; x14 = 30 matches).
_mixmuljmptest21:
mixmuljmptest21:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x8, 7
mov x9, 6
mov x10, 1
mov x11, 2
mov x12, 3
mov x13, 4
mov x15, 5
mov x14, 30
mixmuljmptest21_loop:
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
mul x9, x9, x15
mul x8, x8, x15
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
mul x9, x9, x15
mul x8, x8, x15
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
sub x0, x0, x14
cbnz x0, mixmuljmptest21_loop
mixmuljmptest21_jellydonut:
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixaddjmptest(x0 = iteration count)
// 1:1 mix of scalar adds and not-taken branches (10 add + 10 cbz = 20
// per iteration). cbz never fires: x0 is nonzero inside the loop.
// Fix vs. original: the last three cbz targeted mixmuljmptest_jellydonut
// and — critically — the loop-back branch was
// "cbnz x0, mixmuljmptest_loop", so after the first iteration control
// transferred into mixmuljmptest's loop and the test measured MULs
// instead of ADDs. All four now reference this function's own labels.
_mixaddjmptest:
mixaddjmptest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x8, 7
mov x9, 6
mov x10, 1
mov x11, 2
mov x12, 3
mov x13, 4
mov x15, 5
mov x14, 20
mixaddjmptest_loop:
add x10, x10, x15
add x11, x11, x15
add x12, x12, x15
add x13, x13, x15
add x9, x9, x15
add x8, x8, x15
add x10, x10, x15
add x11, x11, x15
add x12, x12, x15
add x13, x13, x15
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
sub x0, x0, x14
cbnz x0, mixaddjmptest_loop
mixaddjmptest_jellydonut:
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixaddjmp21test(x0 = iteration count)
// 2:1 mix of scalar adds and not-taken branches (10 add + 5 cbz = 15
// per iteration; x14 = 15 matches). Loop-back uses cmp/b.gt so any
// over-shoot past zero still terminates.
_mixaddjmp21test:
mixaddjmp21test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x8, 7
mov x9, 6
mov x10, 1
mov x11, 2
mov x12, 3
mov x13, 4
mov x15, 5
mov x14, 15
mixaddjmp21test_loop:
add x10, x10, x15
add x11, x11, x15
cbz x0, mixaddjmp21test_jellydonut
add x12, x12, x15
add x13, x13, x15
cbz x0, mixaddjmp21test_jellydonut
add x9, x9, x15
add x8, x8, x15
cbz x0, mixaddjmp21test_jellydonut
add x10, x10, x15
add x11, x11, x15
cbz x0, mixaddjmp21test_jellydonut
add x12, x12, x15
add x13, x13, x15
cbz x0, mixaddjmp21test_jellydonut
sub x0, x0, x14
cmp x0, 0
b.gt mixaddjmp21test_loop
mixaddjmp21test_jellydonut:
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixmulrortest(x0 = iteration count)
// 1:1 mix of scalar multiplies and rotates (10 mul + 10 ror = 20 per
// iteration) to check whether mul and shift/rotate share a pipe.
// Uses callee-saved x19-x24 for the ror chains, saved/restored here.
_mixmulrortest:
mixmulrortest:
sub sp, sp, #0x80
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
stp x19, x20, [sp, #0x50]
stp x21, x22, [sp, #0x60]
stp x23, x24, [sp, #0x70]
mov x8, 7
mov x9, 6
mov x10, 1
mov x11, 2
mov x12, 3
mov x13, 4
mov x15, 5
mov x19, x8
mov x20, x8
mov x21, x8
mov x22, x8
mov x23, x8
mov x24, x8
mov x14, 20
mixmulrortest_loop:
ror x24, x24, 1
ror x23, x23, 1
ror x22, x22, 1
ror x21, x21, 1
ror x20, x20, 1
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
mul x9, x9, x15
ror x24, x24, 1
ror x23, x23, 1
ror x22, x22, 1
ror x21, x21, 1
ror x20, x20, 1
mul x8, x8, x15
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
sub x0, x0, x14
cbnz x0, mixmulrortest_loop
ldp x23, x24, [sp, #0x70]
ldp x21, x22, [sp, #0x60]
ldp x19, x20, [sp, #0x50]
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x80
ret
// rortest(x0 = iteration count)
// Rotate throughput: 20 ror per iteration over a 6-register rotation.
// rortest_jellydonut is an unused label (no branch targets it).
_rortest:
rortest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x8, 7
mov x9, 6
mov x10, 1
mov x11, 2
mov x12, 3
mov x13, 4
mov x15, 5
mov x14, 20
rortest_loop:
ror x10, x10, 1
ror x11, x11, 1
ror x12, x12, 1
ror x13, x13, 1
ror x9, x9, 1
ror x8, x8, 1
ror x10, x10, 1
ror x11, x11, 1
ror x12, x12, 1
ror x13, x13, 1
ror x10, x10, 1
ror x11, x11, 1
ror x12, x12, 1
ror x13, x13, 1
ror x9, x9, 1
ror x8, x8, 1
ror x10, x10, 1
ror x11, x11, 1
ror x12, x12, 1
ror x13, x13, 1
sub x0, x0, x14
cbnz x0, rortest_loop
rortest_jellydonut:
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// depmovtest(x0 = iteration count)
// Dependent register-to-register mov chain: each mov reads the result
// of the previous one. If the core eliminates movs at rename, this
// still runs at 1/cycle or better; otherwise it exposes mov latency.
_depmovtest:
depmovtest:
sub sp, sp, #0x40
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
mov x15, 1
mov x14, 20
eor x13, x13, x13
depmovtest_loop:
mov x12, x15
mov x10, x12
mov x13, x10
mov x11, x13
mov x15, x11
mov x12, x15
mov x10, x12
mov x13, x10
mov x11, x13
mov x15, x11
mov x12, x15
mov x10, x12
mov x13, x10
mov x11, x13
mov x15, x11
mov x12, x15
mov x10, x12
mov x13, x10
mov x11, x13
mov x15, x11
sub x0, x0, x14
cbnz x0, depmovtest_loop
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x40
ret
// indepmovtest(x0 = iteration count)
// Independent register-to-register movs: 20 movs per iteration with no
// chain between them, measuring mov throughput / move elimination width.
_indepmovtest:
indepmovtest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 20
eor x13, x13, x13
indepmovtest_loop:
mov x10, x15
mov x11, x14
mov x12, x13
mov x9, x15
mov x8, x14
mov x10, x15
mov x11, x14
mov x12, x13
mov x9, x15
mov x8, x14
mov x10, x15
mov x11, x14
mov x12, x13
mov x9, x15
mov x8, x14
mov x10, x15
mov x11, x14
mov x12, x13
mov x9, x15
mov x8, x14
sub x0, x0, x14
cbnz x0, indepmovtest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// xorzerotest(x0 = iteration count)
// Zeroing-idiom test: 20x "eor x15, x15, x15" per iteration. If the
// core recognizes self-xor as a dependency-breaking zeroing idiom these
// execute independently; otherwise they form a serial chain.
_xorzerotest:
xorzerotest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 20
xorzerotest_loop:
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
sub x0, x0, x14
cbnz x0, xorzerotest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// movzerotest(x0 = iteration count)
// Immediate-zero mov throughput: 20x "mov x15, 0" per iteration; all
// writes are independent, measuring rename/issue width for mov-imm.
_movzerotest:
movzerotest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 20
movzerotest_loop:
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
sub x0, x0, x14
cbnz x0, movzerotest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// subzerotest(x0 = iteration count)
// Zeroing-idiom test via self-subtract: 20x "sub x15, x15, x15" per
// iteration; checks whether sub-same-reg is dependency-breaking.
_subzerotest:
subzerotest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 20
subzerotest_loop:
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x0, x0, x14
cbnz x0, subzerotest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// aesetest(x0 = iteration count, x1 = test data pointer)
// AES encrypt-round (aese) throughput: 20 aese per iteration over five
// independent v0-v4 chains. v0-v4 start uninitialized — values are
// don't-care, only the rate is measured. x14 is caller-saved under
// AAPCS64, so clobbering it without a save is fine.
_aesetest:
aesetest:
sub sp, sp, #0x50
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
mov x14, 20
aesetest_loop:
aese v0.16b, v16.16b
aese v1.16b, v17.16b
aese v2.16b, v18.16b
aese v3.16b, v19.16b
aese v4.16b, v20.16b
aese v0.16b, v16.16b
aese v1.16b, v17.16b
aese v2.16b, v18.16b
aese v3.16b, v19.16b
aese v4.16b, v20.16b
aese v0.16b, v16.16b
aese v1.16b, v17.16b
aese v2.16b, v18.16b
aese v3.16b, v19.16b
aese v4.16b, v20.16b
aese v0.16b, v16.16b
aese v1.16b, v17.16b
aese v2.16b, v18.16b
aese v3.16b, v19.16b
aese v4.16b, v20.16b
sub x0, x0, x14
cbnz x0, aesetest_loop
add sp, sp, #0x50
ret
// mixaesevecadd128test(x0 = iteration count, x1 = test data pointer)
// 1:1 mix of aese and 128-bit vector adds (10 + 10 = 20 per iteration)
// to see whether the crypto unit issues alongside plain SIMD adds.
// v0-v13 destinations/sources start uninitialized; values are
// don't-care, only issue rate matters.
_mixaesevecadd128test:
mixaesevecadd128test:
sub sp, sp, #0x50
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
mov x14, 20
mixaesevecadd128test_loop:
aese v0.16b, v16.16b
add v5.4s, v9.4s, v16.4s
aese v1.16b, v17.16b
add v6.4s, v10.4s, v16.4s
aese v2.16b, v18.16b
add v7.4s, v11.4s, v16.4s
aese v3.16b, v19.16b
add v31.4s, v12.4s, v16.4s
aese v4.16b, v20.16b
add v30.4s, v13.4s, v16.4s
aese v0.16b, v16.16b
add v5.4s, v9.4s, v16.4s
aese v1.16b, v17.16b
add v6.4s, v10.4s, v16.4s
aese v2.16b, v18.16b
add v7.4s, v11.4s, v16.4s
aese v3.16b, v19.16b
add v31.4s, v12.4s, v16.4s
aese v4.16b, v20.16b
add v30.4s, v13.4s, v16.4s
sub x0, x0, x14
cbnz x0, mixaesevecadd128test_loop
add sp, sp, #0x50
ret
// pmulltest(x0 = iteration count, x1 = test data pointer)
// Polynomial multiply (pmull, 64x64->128) throughput: 20 independent
// pmull per iteration writing v0-v4 from the same v16/v17 sources.
_pmulltest:
pmulltest:
sub sp, sp, #0x50
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
mov x14, 20
pmulltest_loop:
pmull v0.1q, v16.1d, v17.1d
pmull v1.1q, v16.1d, v17.1d
pmull v2.1q, v16.1d, v17.1d
pmull v3.1q, v16.1d, v17.1d
pmull v4.1q, v16.1d, v17.1d
pmull v0.1q, v16.1d, v17.1d
pmull v1.1q, v16.1d, v17.1d
pmull v2.1q, v16.1d, v17.1d
pmull v3.1q, v16.1d, v17.1d
pmull v4.1q, v16.1d, v17.1d
pmull v0.1q, v16.1d, v17.1d
pmull v1.1q, v16.1d, v17.1d
pmull v2.1q, v16.1d, v17.1d
pmull v3.1q, v16.1d, v17.1d
pmull v4.1q, v16.1d, v17.1d
pmull v0.1q, v16.1d, v17.1d
pmull v1.1q, v16.1d, v17.1d
pmull v2.1q, v16.1d, v17.1d
pmull v3.1q, v16.1d, v17.1d
pmull v4.1q, v16.1d, v17.1d
sub x0, x0, x14
cbnz x0, pmulltest_loop
add sp, sp, #0x50
ret
// mixpmulladd128test(x0 = iteration count, x1 = test data pointer)
// 1:1 mix of pmull and 128-bit vector adds (10 + 10 = 20 per iteration)
// to see whether polynomial multiply shares a pipe with SIMD add.
// v9 starts uninitialized; values are don't-care for a rate test.
_mixpmulladd128test:
mixpmulladd128test:
sub sp, sp, #0x50
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
mov x14, 20
mixpmulladd128test_loop:
pmull v0.1q, v16.1d, v17.1d
add v5.4s, v9.4s, v16.4s
pmull v1.1q, v16.1d, v17.1d
add v6.4s, v9.4s, v16.4s
pmull v2.1q, v16.1d, v17.1d
add v7.4s, v9.4s, v16.4s
pmull v3.1q, v16.1d, v17.1d
add v31.4s, v9.4s, v16.4s
pmull v4.1q, v16.1d, v17.1d
add v30.4s, v9.4s, v16.4s
pmull v0.1q, v16.1d, v17.1d
add v5.4s, v9.4s, v16.4s
pmull v1.1q, v16.1d, v17.1d
add v6.4s, v9.4s, v16.4s
pmull v2.1q, v16.1d, v17.1d
add v7.4s, v9.4s, v16.4s
pmull v3.1q, v16.1d, v17.1d
add v31.4s, v9.4s, v16.4s
pmull v4.1q, v16.1d, v17.1d
add v30.4s, v9.4s, v16.4s
sub x0, x0, x14
cbnz x0, mixpmulladd128test_loop
add sp, sp, #0x50
ret
================================================
FILE: InstructionRate/riscv_instructionrate.c
================================================
#define _GNU_SOURCE
/* NOTE(review): the original include list was garbled in extraction (header
   names missing). Reconstructed from the identifiers used in this file:
   printf/fprintf (stdio), uint64_t (stdint), atoi (stdlib), strncmp (string),
   gettimeofday/timeval/timezone (sys/time). Verify the last three against the
   repository's other instructionrate drivers. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>
#include <math.h>
#include <sys/time.h>
#include <unistd.h>
float measureFunction(uint64_t iterations, float clockSpeedGhz, void *arr, uint64_t (*testfunc)(uint64_t, void *));
extern uint64_t clktest(uint64_t iterations, void *data);
extern uint64_t addtest(uint64_t iterations, void *data);
extern uint64_t faddtest(uint64_t iterations, void *data);
extern uint64_t fmultest(uint64_t iterations, void *data);
extern uint64_t mixfaddfmultest(uint64_t iterations, void *data);
extern uint64_t fmatest(uint64_t iterations, void *data);
extern uint64_t faddlattest(uint64_t iterations, void *data);
extern uint64_t fmullattest(uint64_t iterations, void *data);
extern uint64_t fmalattest(uint64_t iterations, void *data);
float fpTestArr[4] __attribute__ ((aligned (64))) = { 0.2, 1.5, 2.7, 3.14 };
int intTestArr[4] __attribute__ ((aligned (64))) = { 1, 2, 3, 4 };
int sinkArr[4] __attribute__ ((aligned (64))) = { 2, 3, 4, 5 };
/*
 * Benchmark driver: estimates core clock speed from a serially dependent add
 * loop, then reports per-clock throughput for integer adds and scalar FP
 * add/mul/FMA, plus latency for the FP ops (latency tests run a dependent
 * chain, so 1/throughput = cycles of latency).
 *
 * Optional argument: -iter <n> multiplies the built-in iteration counts by n.
 * Returns 0 on completion.
 */
int main(int argc, char *argv[]) {
    struct timeval startTv, endTv;
    struct timezone startTz, endTz;
    uint64_t iterations = 1500000000;
    uint64_t iterationsHigh = iterations * 5;
    uint64_t time_diff_ms;
    float latency, clockSpeedGhz; /* dropped unused opsPerNs local */

    /* (redundant argc > 1 wrapper removed; the loop body never runs then) */
    for (int argIdx = 1; argIdx < argc; argIdx++) {
        if (*(argv[argIdx]) == '-') {
            char *arg = argv[argIdx] + 1;
            if (strncmp(arg, "iter", 4) == 0) {
                /* Bug fix: the original unconditionally consumed the next
                 * argument, so a trailing "-iter" passed argv[argc] (NULL)
                 * to atoi and crashed. */
                if (argIdx + 1 >= argc) {
                    fprintf(stderr, "-iter requires an integer argument\n");
                    break;
                }
                argIdx++;
                int iterMul = atoi(argv[argIdx]);
                if (iterMul <= 0) {
                    /* atoi returns 0 on garbage; scaling by <= 0 would zero
                     * or corrupt the iteration counts */
                    fprintf(stderr, "Ignoring non-positive iteration multiplier\n");
                    continue;
                }
                iterations *= iterMul;
                iterationsHigh *= iterMul;
                fprintf(stderr, "Scaled iterations by %d\n", iterMul);
            }
        }
    }

    /* Time a chain of dependent adds; assuming ~1 add/cycle, 1/latency(ns)
     * approximates the clock in GHz. */
    gettimeofday(&startTv, &startTz);
    clktest(iterations, NULL);
    gettimeofday(&endTv, &endTz);
    time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
    latency = 1e6 * (float)time_diff_ms / (float)iterations;
    // clk speed should be 1/latency, assuming we got one add per clk, roughly
    clockSpeedGhz = 1 / latency;
    printf("Estimated clock speed> %.2f GHz\n", clockSpeedGhz);

    // integer side
    printf("Adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, NULL, addtest));

    // FP
    printf("FP32 Adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, faddtest));
    printf("FP32 Add latency> %.2f cycles\n", 1 / measureFunction(iterations, clockSpeedGhz, fpTestArr, faddlattest));
    printf("FP32 Multiplies per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, fmultest));
    printf("FP32 Multiply latency> %.2f cycles\n", 1 / measureFunction(iterations, clockSpeedGhz, fpTestArr, fmullattest));
    printf("1:1 FP32 Add:Mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, mixfaddfmultest));
    printf("FP32 FMA per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, fmatest));
    printf("FP32 FMA latency> %.2f cycles\n", 1 / measureFunction(iterations, clockSpeedGhz, fpTestArr, fmalattest));
    return 0;
}
/*
 * Runs testfunc(iterations, arr) once, times it with gettimeofday, and
 * returns operations per clock (measured ops/ns divided by clockSpeedGhz).
 *
 * iterations    - count handed to the assembly kernel (it decrements by its
 *                 unroll factor, so this approximates total ops executed)
 * clockSpeedGhz - previously estimated core clock, used for normalization
 * arr           - opaque data pointer passed through to the kernel (may be NULL)
 * testfunc      - kernel under test; its return value is only for debugging
 *
 * Robustness fix: a run completing in under 1 ms made time_diff_ms zero, so
 * the division below produced latency == 0 and an inf result; now warns and
 * returns 0 instead.
 */
float measureFunction(uint64_t iterations, float clockSpeedGhz, void *arr, uint64_t (*testfunc)(uint64_t, void *)) {
    struct timeval startTv, endTv;
    struct timezone startTz, endTz;
    uint64_t time_diff_ms, retval;
    float latency, opsPerNs;

    gettimeofday(&startTv, &startTz);
    retval = testfunc(iterations, arr);
    gettimeofday(&endTv, &endTz);
    time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);

    if (time_diff_ms == 0) {
        fprintf(stderr, "Test finished in under 1 ms; scale up iterations with -iter\n");
        return 0.0f;
    }

    (void)retval; /* silence unused-variable warning; see debug printf below */
    //printf("return value: %lu\n", retval);
    latency = 1e6 * (float)time_diff_ms / (float)iterations;
    opsPerNs = 1 / latency;
    return opsPerNs / clockSpeedGhz;
}
================================================
FILE: InstructionRate/riscv_instructionrate.s
================================================
.text
.global clktest
.global addtest
.global faddtest
.global fmultest
.global mixfaddfmultest
.global fmatest
.global faddlattest
.global fmullattest
.global fmalattest
/* a0 = iterations, a1 = data arr */
clktest:
/* Clock-speed estimation kernel. t0 accumulates t1 (= 1) twenty times per
   iteration; every add reads the previous add's result, so the chain is
   fully serial. NOTE(review): assumes the core retires ~1 dependent add per
   cycle — the C driver derives clock speed from the elapsed time on that
   assumption. */
mv t0, x0
mv t1, x0
addi t1, t1, 1
clktest_loop:
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
/* 20 adds per iteration; loop while a0 > 0 */
addi a0, a0, -20
blt x0, a0, clktest_loop
ret
addtest:
/* Integer add throughput test: five independent accumulators (t1-t5) each
   add the shared constant t6 (= 1), 20 adds per iteration, so consecutive
   instructions have no data dependencies and can issue in parallel. */
mv t0, x0
addi t0, t0, 1
mv t1, t0
mv t2, t0
mv t3, t0
mv t4, t0
mv t5, t0
mv t6, t0
addtest_loop:
add t1, t1, t6
add t2, t2, t6
add t3, t3, t6
add t4, t4, t6
add t5, t5, t6
add t1, t1, t6
add t2, t2, t6
add t3, t3, t6
add t4, t4, t6
add t5, t5, t6
add t1, t1, t6
add t2, t2, t6
add t3, t3, t6
add t4, t4, t6
add t5, t5, t6
add t1, t1, t6
add t2, t2, t6
add t3, t3, t6
add t4, t4, t6
add t5, t5, t6
/* 20 adds per iteration; loop while a0 > 0 */
addi a0, a0, -20
blt x0, a0, addtest_loop
ret
/* f0-7 are fp temporaries */
faddtest:
/* FP32 add throughput: f1-f7 accumulate independently, 14 adds/iteration.
   Bug fix: the test loads single-precision data with flw (which NaN-boxes
   the upper 32 bits) and the C driver reports it as "FP32", but the original
   used fadd.d/fsub.d — the .d ops see the NaN-boxed singles as FP64 NaNs, so
   it was really measuring double-precision throughput. Switched to .s. */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7 (x - x), then seed them from f0 */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
faddtest_loop:
fadd.s f1, f1, f0
fadd.s f2, f2, f0
fadd.s f3, f3, f0
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
fadd.s f1, f1, f0
fadd.s f2, f2, f0
fadd.s f3, f3, f0
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
/* 14 adds per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, faddtest_loop
ret
faddlattest:
/* FP32 add latency: f1 = f1 + f1 repeated 14 times per iteration forms a
   fully serial chain, so measured throughput = 1 / add latency.
   Bug fix: switched fadd.d/fsub.d to the .s forms — the data is loaded with
   flw (NaN-boxed single precision) and the C driver labels this "FP32", so
   the original was actually timing FP64 ops on NaNs. */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7, then seed from f0 (kept for symmetry with faddtest) */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
faddlattest_loop:
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
/* 14 dependent adds per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, faddlattest_loop
ret
fmultest:
/* FP32 multiply throughput: f1-f7 are independent, 14 muls/iteration.
   Bug fix: switched fmul.d/fsub.d/fadd.d to the .s forms to match the flw
   loads and the "FP32" label in the C driver; the .d ops treated the
   NaN-boxed singles as FP64 NaNs and measured double precision. */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7, then seed them from f0 */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
fmultest_loop:
fmul.s f1, f1, f0
fmul.s f2, f2, f0
fmul.s f3, f3, f0
fmul.s f4, f4, f0
fmul.s f5, f5, f0
fmul.s f6, f6, f0
fmul.s f7, f7, f0
fmul.s f1, f1, f0
fmul.s f2, f2, f0
fmul.s f3, f3, f0
fmul.s f4, f4, f0
fmul.s f5, f5, f0
fmul.s f6, f6, f0
fmul.s f7, f7, f0
/* 14 multiplies per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, fmultest_loop
ret
fmullattest:
/* FP32 multiply latency: serial chain f1 = f1 * f1, 14 per iteration, so
   measured throughput = 1 / multiply latency.
   Bug fix: .d forms replaced with .s to match the flw loads and the "FP32"
   label in the C driver (the originals operated on NaN-boxed singles as
   FP64 NaNs). */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7, then seed from f0 (kept for symmetry with fmultest) */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
fmullattest_loop:
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
/* 14 dependent multiplies per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, fmullattest_loop
ret
mixfaddfmultest:
/* 1:1 FP32 add:multiply throughput mix, 14 ops/iteration across f1-f7 with
   the add/mul role alternating between the two unrolled halves.
   Bug fix: .d forms replaced with .s to match the flw loads and the
   "1:1 FP32 Add:Mul" label in the C driver (the originals operated on
   NaN-boxed singles as FP64 NaNs). */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7, then seed them from f0 */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
mixfaddfmultest_loop:
fadd.s f1, f1, f0
fmul.s f2, f2, f0
fadd.s f3, f3, f0
fmul.s f4, f4, f0
fadd.s f5, f5, f0
fmul.s f6, f6, f0
fadd.s f7, f7, f0
fmul.s f1, f1, f0
fadd.s f2, f2, f0
fmul.s f3, f3, f0
fadd.s f4, f4, f0
fmul.s f5, f5, f0
fadd.s f6, f6, f0
fmul.s f7, f7, f0
/* 14 ops per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, mixfaddfmultest_loop
ret
fmatest:
/* FP32 FMA throughput: f1-f7 computed independently as fX = fX*fX + f0,
   14 FMAs per iteration.
   Bug fix: fmadd.d (and the .d setup ops) replaced with .s to match the flw
   loads and the "FP32 FMA" label in the C driver (the originals operated on
   NaN-boxed singles as FP64 NaNs). */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7, then seed them from f0 */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
fmatest_loop:
fmadd.s f1, f1, f1, f0
fmadd.s f2, f2, f2, f0
fmadd.s f3, f3, f3, f0
fmadd.s f4, f4, f4, f0
fmadd.s f5, f5, f5, f0
fmadd.s f6, f6, f6, f0
fmadd.s f7, f7, f7, f0
fmadd.s f1, f1, f1, f0
fmadd.s f2, f2, f2, f0
fmadd.s f3, f3, f3, f0
fmadd.s f4, f4, f4, f0
fmadd.s f5, f5, f5, f0
fmadd.s f6, f6, f6, f0
fmadd.s f7, f7, f7, f0
/* 14 FMAs per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, fmatest_loop
ret
fmalattest:
/* FP32 FMA latency: serial chain f1 = f1*f1 + f0 repeated 14 times per
   iteration, so measured throughput = 1 / FMA latency.
   Bug fix: .d forms replaced with .s to match the flw loads and the
   "FP32 FMA latency" label in the C driver (the originals operated on
   NaN-boxed singles as FP64 NaNs). */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7, then seed from f0 (kept for symmetry with fmatest) */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
fmalattest_loop:
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
/* 14 dependent FMAs per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, fmalattest_loop
ret
================================================
FILE: InstructionRate/test.s
================================================
x86_instructionrate: file format elf64-x86-64
Disassembly of section .init:
0000000000001000 <_init>:
1000: f3 0f 1e fa endbr64
1004: 48 83 ec 08 sub $0x8,%rsp
1008: 48 8b 05 d9 cf 00 00 mov 0xcfd9(%rip),%rax # dfe8 <__gmon_start__>
100f: 48 85 c0 test %rax,%rax
1012: 74 02 je 1016 <_init+0x16>
1014: ff d0 callq *%rax
1016: 48 83 c4 08 add $0x8,%rsp
101a: c3 retq
Disassembly of section .plt:
0000000000001020 <.plt>:
1020: ff 35 62 cf 00 00 pushq 0xcf62(%rip) # df88 <_GLOBAL_OFFSET_TABLE_+0x8>
1026: ff 25 64 cf 00 00 jmpq *0xcf64(%rip) # df90 <_GLOBAL_OFFSET_TABLE_+0x10>
102c: 0f 1f 40 00 nopl 0x0(%rax)
0000000000001030 :
1030: ff 25 62 cf 00 00 jmpq *0xcf62(%rip) # df98
1036: 68 00 00 00 00 pushq $0x0
103b: e9 e0 ff ff ff jmpq 1020 <.plt>
0000000000001040 <__stack_chk_fail@plt>:
1040: ff 25 5a cf 00 00 jmpq *0xcf5a(%rip) # dfa0 <__stack_chk_fail@GLIBC_2.4>
1046: 68 01 00 00 00 pushq $0x1
104b: e9 d0 ff ff ff jmpq 1020 <.plt>
0000000000001050 :
1050: ff 25 52 cf 00 00 jmpq *0xcf52(%rip) # dfa8
1056: 68 02 00 00 00 pushq $0x2
105b: e9 c0 ff ff ff jmpq 1020 <.plt>
0000000000001060 :
1060: ff 25 4a cf 00 00 jmpq *0xcf4a(%rip) # dfb0
1066: 68 03 00 00 00 pushq $0x3
106b: e9 b0 ff ff ff jmpq 1020 <.plt>
0000000000001070 :
1070: ff 25 42 cf 00 00 jmpq *0xcf42(%rip) # dfb8
1076: 68 04 00 00 00 pushq $0x4
107b: e9 a0 ff ff ff jmpq 1020 <.plt>
0000000000001080 <__printf_chk@plt>:
1080: ff 25 3a cf 00 00 jmpq *0xcf3a(%rip) # dfc0 <__printf_chk@GLIBC_2.3.4>
1086: 68 05 00 00 00 pushq $0x5
108b: e9 90 ff ff ff jmpq 1020 <.plt>
0000000000001090 :
1090: ff 25 32 cf 00 00 jmpq *0xcf32(%rip) # dfc8
1096: 68 06 00 00 00 pushq $0x6
109b: e9 80 ff ff ff jmpq 1020 <.plt>
00000000000010a0 :
10a0: ff 25 2a cf 00 00 jmpq *0xcf2a(%rip) # dfd0
10a6: 68 07 00 00 00 pushq $0x7
10ab: e9 70 ff ff ff jmpq 1020 <.plt>
Disassembly of section .plt.got:
00000000000010b0 <__cxa_finalize@plt>:
10b0: ff 25 42 cf 00 00 jmpq *0xcf42(%rip) # dff8 <__cxa_finalize@GLIBC_2.2.5>
10b6: 66 90 xchg %ax,%ax
Disassembly of section .text:
00000000000010c0 :
10c0: f3 0f 1e fa endbr64
10c4: 41 57 push %r15
10c6: 41 56 push %r14
10c8: 41 55 push %r13
10ca: 41 54 push %r12
10cc: 41 89 fc mov %edi,%r12d
10cf: bf 40 00 00 00 mov $0x40,%edi
10d4: 55 push %rbp
10d5: 48 89 f5 mov %rsi,%rbp
10d8: be 00 10 00 00 mov $0x1000,%esi
10dd: 53 push %rbx
10de: 48 83 ec 58 sub $0x58,%rsp
10e2: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax
10e9: 00 00
10eb: 48 89 44 24 48 mov %rax,0x48(%rsp)
10f0: 31 c0 xor %eax,%eax
10f2: e8 a9 ff ff ff callq 10a0
10f7: 66 0f 6f 0d 91 af 00 movdqa 0xaf91(%rip),%xmm1 # c090 <_IO_stdin_used+0x1090>
10fe: 00
10ff: 66 0f 6f 25 99 af 00 movdqa 0xaf99(%rip),%xmm4 # c0a0 <_IO_stdin_used+0x10a0>
1106: 00
1107: 48 89 05 f2 cf 00 00 mov %rax,0xcff2(%rip) # e100
110e: 66 0f 6f 1d 9a af 00 movdqa 0xaf9a(%rip),%xmm3 # c0b0 <_IO_stdin_used+0x10b0>
1115: 00
1116: 48 8d 90 00 10 00 00 lea 0x1000(%rax),%rdx
111d: 0f 1f 00 nopl (%rax)
1120: 66 0f 6f c1 movdqa %xmm1,%xmm0
1124: 48 83 c0 10 add $0x10,%rax
1128: 66 0f d4 cc paddq %xmm4,%xmm1
112c: 66 0f 6f d0 movdqa %xmm0,%xmm2
1130: 66 0f d4 d3 paddq %xmm3,%xmm2
1134: 0f c6 c2 88 shufps $0x88,%xmm2,%xmm0
1138: 0f 29 40 f0 movaps %xmm0,-0x10(%rax)
113c: 48 39 c2 cmp %rax,%rdx
113f: 75 df jne 1120
1141: 49 be 00 eb 08 bf 01 movabs $0x1bf08eb00,%r14
1148: 00 00 00
114b: 41 83 fc 02 cmp $0x2,%r12d
114f: 0f 8f db 35 00 00 jg 4730
1155: 4c 8d 2d 94 cf 00 00 lea 0xcf94(%rip),%r13 # e0f0 <__cpu_model>
115c: 41 f6 45 0d 02 testb $0x2,0xd(%r13)
1161: 0f 85 a7 35 00 00 jne 470e
1167: 41 f6 45 0d 04 testb $0x4,0xd(%r13)
116c: 0f 85 7a 35 00 00 jne 46ec
1172: 41 f6 45 0e 02 testb $0x2,0xe(%r13)
1177: 0f 85 4d 35 00 00 jne 46ca
117d: b8 07 00 00 00 mov $0x7,%eax
1182: 31 c9 xor %ecx,%ecx
1184: 0f a2 cpuid
1186: 81 e3 00 00 01 00 and $0x10000,%ebx
118c: 0f 85 af 1f 00 00 jne 3141
1192: 41 83 fc 01 cmp $0x1,%r12d
1196: 0f 84 9c 47 00 00 je 5938
119c: f2 0f 10 05 dc ae 00 movsd 0xaedc(%rip),%xmm0 # c080 <_IO_stdin_used+0x1080>
11a3: 00
11a4: bf 01 00 00 00 mov $0x1,%edi
11a9: b8 01 00 00 00 mov $0x1,%eax
11ae: 48 8d 35 c3 a5 00 00 lea 0xa5c3(%rip),%rsi # b778 <_IO_stdin_used+0x778>
11b5: e8 c6 fe ff ff callq 1080 <__printf_chk@plt>
11ba: f3 0f 10 35 c6 ae 00 movss 0xaec6(%rip),%xmm6 # c088 <_IO_stdin_used+0x1088>
11c1: 00
11c2: f3 0f 11 74 24 0c movss %xmm6,0xc(%rsp)
11c8: f3 0f 11 74 24 08 movss %xmm6,0x8(%rsp)
11ce: 41 83 fc 01 cmp $0x1,%r12d
11d2: 0f 8e 6e 17 00 00 jle 2946
11d8: 4c 8b 6d 08 mov 0x8(%rbp),%r13
11dc: ba 05 00 00 00 mov $0x5,%edx
11e1: 48 8d 35 86 a5 00 00 lea 0xa586(%rip),%rsi # b76e <_IO_stdin_used+0x76e>
11e8: 4c 89 ef mov %r13,%rdi
11eb: e8 40 fe ff ff callq 1030
11f0: 85 c0 test %eax,%eax
11f2: 0f 85 a3 17 00 00 jne 299b
11f8: 48 8d 35 79 5b 00 00 lea 0x5b79(%rip),%rsi # 6d78
11ff: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1205: 4c 89 f7 mov %r14,%rdi
1208: e8 33 98 00 00 callq aa40
120d: bf 01 00 00 00 mov $0x1,%edi
1212: b8 01 00 00 00 mov $0x1,%eax
1217: 48 8d 35 2e a5 00 00 lea 0xa52e(%rip),%rsi # b74c <_IO_stdin_used+0x74c>
121e: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
1222: e8 59 fe ff ff callq 1080 <__printf_chk@plt>
1227: 4c 8b 6d 08 mov 0x8(%rbp),%r13
122b: ba 05 00 00 00 mov $0x5,%edx
1230: 48 8d 35 8e 9f 00 00 lea 0x9f8e(%rip),%rsi # b1c5 <_IO_stdin_used+0x1c5>
1237: 4c 89 ef mov %r13,%rdi
123a: e8 f1 fd ff ff callq 1030
123f: 85 c0 test %eax,%eax
1241: 0f 85 70 17 00 00 jne 29b7
1247: 48 8d 35 f1 5a 00 00 lea 0x5af1(%rip),%rsi # 6d3f
124e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1254: 4c 89 f7 mov %r14,%rdi
1257: e8 e4 97 00 00 callq aa40
125c: bf 01 00 00 00 mov $0x1,%edi
1261: b8 01 00 00 00 mov $0x1,%eax
1266: 48 8d 35 c4 a4 00 00 lea 0xa4c4(%rip),%rsi # b731 <_IO_stdin_used+0x731>
126d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
1271: e8 0a fe ff ff callq 1080 <__printf_chk@plt>
1276: 4c 8b 6d 08 mov 0x8(%rbp),%r13
127a: ba 03 00 00 00 mov $0x3,%edx
127f: 48 8d 35 89 9f 00 00 lea 0x9f89(%rip),%rsi # b20f <_IO_stdin_used+0x20f>
1286: 4c 89 ef mov %r13,%rdi
1289: e8 a2 fd ff ff callq 1030
128e: 85 c0 test %eax,%eax
1290: 0f 85 3d 17 00 00 jne 29d3
1296: 48 8d 35 01 5b 00 00 lea 0x5b01(%rip),%rsi # 6d9e
129d: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
12a3: 4c 89 f7 mov %r14,%rdi
12a6: e8 95 97 00 00 callq aa40
12ab: bf 01 00 00 00 mov $0x1,%edi
12b0: b8 01 00 00 00 mov $0x1,%eax
12b5: 48 8d 35 61 a4 00 00 lea 0xa461(%rip),%rsi # b71d <_IO_stdin_used+0x71d>
12bc: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
12c0: e8 bb fd ff ff callq 1080 <__printf_chk@plt>
12c5: 4c 8b 6d 08 mov 0x8(%rbp),%r13
12c9: 48 8d 35 fb 9e 00 00 lea 0x9efb(%rip),%rsi # b1cb <_IO_stdin_used+0x1cb>
12d0: 4c 89 ef mov %r13,%rdi
12d3: e8 88 fd ff ff callq 1060
12d8: 85 c0 test %eax,%eax
12da: 0f 85 0a 17 00 00 jne 29ea
12e0: 4c 8d 3d 43 5b 00 00 lea 0x5b43(%rip),%r15 # 6e2a
12e7: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
12ed: 4c 89 f7 mov %r14,%rdi
12f0: 4c 89 fe mov %r15,%rsi
12f3: e8 48 97 00 00 callq aa40
12f8: bf 01 00 00 00 mov $0x1,%edi
12fd: b8 01 00 00 00 mov $0x1,%eax
1302: 48 8d 35 f7 a3 00 00 lea 0xa3f7(%rip),%rsi # b700 <_IO_stdin_used+0x700>
1309: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
130d: e8 6e fd ff ff callq 1080 <__printf_chk@plt>
1312: 4c 8b 6d 08 mov 0x8(%rbp),%r13
1316: 48 8d 35 b5 9e 00 00 lea 0x9eb5(%rip),%rsi # b1d2 <_IO_stdin_used+0x1d2>
131d: 4c 89 ef mov %r13,%rdi
1320: e8 3b fd ff ff callq 1060
1325: 85 c0 test %eax,%eax
1327: 0f 85 d4 16 00 00 jne 2a01
132d: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1333: 4c 89 fe mov %r15,%rsi
1336: 4c 89 f7 mov %r14,%rdi
1339: e8 02 97 00 00 callq aa40
133e: bf 01 00 00 00 mov $0x1,%edi
1343: b8 01 00 00 00 mov $0x1,%eax
1348: 48 8d 35 94 a3 00 00 lea 0xa394(%rip),%rsi # b6e3 <_IO_stdin_used+0x6e3>
134f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
1353: e8 28 fd ff ff callq 1080 <__printf_chk@plt>
1358: 4c 8b 6d 08 mov 0x8(%rbp),%r13
135c: ba 06 00 00 00 mov $0x6,%edx
1361: 48 8d 35 73 9e 00 00 lea 0x9e73(%rip),%rsi # b1db <_IO_stdin_used+0x1db>
1368: 4c 89 ef mov %r13,%rdi
136b: e8 c0 fc ff ff callq 1030
1370: 85 c0 test %eax,%eax
1372: 0f 85 a5 16 00 00 jne 2a1d
1378: 48 8d 35 c8 91 00 00 lea 0x91c8(%rip),%rsi # a547
137f: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1385: 4c 89 f7 mov %r14,%rdi
1388: e8 b3 96 00 00 callq aa40
138d: bf 01 00 00 00 mov $0x1,%edi
1392: b8 01 00 00 00 mov $0x1,%eax
1397: 48 8d 35 27 a3 00 00 lea 0xa327(%rip),%rsi # b6c5 <_IO_stdin_used+0x6c5>
139e: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
13a2: e8 d9 fc ff ff callq 1080 <__printf_chk@plt>
13a7: 4c 8b 6d 08 mov 0x8(%rbp),%r13
13ab: ba 08 00 00 00 mov $0x8,%edx
13b0: 48 8d 35 22 9e 00 00 lea 0x9e22(%rip),%rsi # b1d9 <_IO_stdin_used+0x1d9>
13b7: 4c 89 ef mov %r13,%rdi
13ba: e8 71 fc ff ff callq 1030
13bf: 85 c0 test %eax,%eax
13c1: 0f 85 72 16 00 00 jne 2a39
13c7: 48 8d 35 ee 91 00 00 lea 0x91ee(%rip),%rsi # a5bc
13ce: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
13d4: 4c 89 f7 mov %r14,%rdi
13d7: e8 64 96 00 00 callq aa40
13dc: bf 01 00 00 00 mov $0x1,%edi
13e1: b8 01 00 00 00 mov $0x1,%eax
13e6: 48 8d 35 3b ac 00 00 lea 0xac3b(%rip),%rsi # c028 <_IO_stdin_used+0x1028>
13ed: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
13f1: e8 8a fc ff ff callq 1080 <__printf_chk@plt>
13f6: 4c 8b 6d 08 mov 0x8(%rbp),%r13
13fa: ba 07 00 00 00 mov $0x7,%edx
13ff: 48 8d 35 dc 9d 00 00 lea 0x9ddc(%rip),%rsi # b1e2 <_IO_stdin_used+0x1e2>
1406: 4c 89 ef mov %r13,%rdi
1409: e8 22 fc ff ff callq 1030
140e: 85 c0 test %eax,%eax
1410: 0f 85 3f 16 00 00 jne 2a55
1416: 48 8d 35 e1 92 00 00 lea 0x92e1(%rip),%rsi # a6fe
141d: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1423: 4c 89 f7 mov %r14,%rdi
1426: e8 15 96 00 00 callq aa40
142b: bf 01 00 00 00 mov $0x1,%edi
1430: b8 01 00 00 00 mov $0x1,%eax
1435: 48 8d 35 71 a2 00 00 lea 0xa271(%rip),%rsi # b6ad <_IO_stdin_used+0x6ad>
143c: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
1440: e8 3b fc ff ff callq 1080 <__printf_chk@plt>
1445: 4c 8b 6d 08 mov 0x8(%rbp),%r13
1449: ba 07 00 00 00 mov $0x7,%edx
144e: 48 8d 35 95 9d 00 00 lea 0x9d95(%rip),%rsi # b1ea <_IO_stdin_used+0x1ea>
1455: 4c 89 ef mov %r13,%rdi
1458: e8 d3 fb ff ff callq 1030
145d: 85 c0 test %eax,%eax
145f: 0f 85 0c 16 00 00 jne 2a71
1465: 48 8d 35 c7 91 00 00 lea 0x91c7(%rip),%rsi # a633
146c: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1472: 4c 89 f7 mov %r14,%rdi
1475: e8 c6 95 00 00 callq aa40
147a: bf 01 00 00 00 mov $0x1,%edi
147f: b8 01 00 00 00 mov $0x1,%eax
1484: 48 8d 35 0a a2 00 00 lea 0xa20a(%rip),%rsi # b695 <_IO_stdin_used+0x695>
148b: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
148f: e8 ec fb ff ff callq 1080 <__printf_chk@plt>
1494: 4c 8b 6d 08 mov 0x8(%rbp),%r13
1498: ba 07 00 00 00 mov $0x7,%edx
149d: 48 8d 35 4e 9d 00 00 lea 0x9d4e(%rip),%rsi # b1f2 <_IO_stdin_used+0x1f2>
14a4: 4c 89 ef mov %r13,%rdi
14a7: e8 84 fb ff ff callq 1030
14ac: 85 c0 test %eax,%eax
14ae: 0f 85 d9 15 00 00 jne 2a8d
14b4: 48 8d 35 ba 92 00 00 lea 0x92ba(%rip),%rsi # a775
14bb: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
14c1: 4c 89 f7 mov %r14,%rdi
14c4: e8 77 95 00 00 callq aa40
14c9: bf 01 00 00 00 mov $0x1,%edi
14ce: b8 01 00 00 00 mov $0x1,%eax
14d3: 48 8d 35 a3 a1 00 00 lea 0xa1a3(%rip),%rsi # b67d <_IO_stdin_used+0x67d>
14da: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
14de: e8 9d fb ff ff callq 1080 <__printf_chk@plt>
14e3: 4c 8b 6d 08 mov 0x8(%rbp),%r13
14e7: ba 06 00 00 00 mov $0x6,%edx
14ec: 48 8d 35 07 9d 00 00 lea 0x9d07(%rip),%rsi # b1fa <_IO_stdin_used+0x1fa>
14f3: 4c 89 ef mov %r13,%rdi
14f6: e8 35 fb ff ff callq 1030
14fb: 85 c0 test %eax,%eax
14fd: 0f 85 a6 15 00 00 jne 2aa9
1503: 48 8d 35 6d 93 00 00 lea 0x936d(%rip),%rsi # a877
150a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1510: 4c 89 f7 mov %r14,%rdi
1513: e8 28 95 00 00 callq aa40
1518: bf 01 00 00 00 mov $0x1,%edi
151d: b8 01 00 00 00 mov $0x1,%eax
1522: 48 8d 35 3d a1 00 00 lea 0xa13d(%rip),%rsi # b666 <_IO_stdin_used+0x666>
1529: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
152d: e8 4e fb ff ff callq 1080 <__printf_chk@plt>
1532: 4c 8b 6d 08 mov 0x8(%rbp),%r13
1536: ba 06 00 00 00 mov $0x6,%edx
153b: 48 8d 35 bf 9c 00 00 lea 0x9cbf(%rip),%rsi # b201 <_IO_stdin_used+0x201>
1542: 4c 89 ef mov %r13,%rdi
1545: e8 e6 fa ff ff callq 1030
154a: 85 c0 test %eax,%eax
154c: 0f 85 73 15 00 00 jne 2ac5
1552: 48 8d 35 95 93 00 00 lea 0x9395(%rip),%rsi # a8ee
1559: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
155f: 4c 89 f7 mov %r14,%rdi
1562: e8 d9 94 00 00 callq aa40
1567: bf 01 00 00 00 mov $0x1,%edi
156c: b8 01 00 00 00 mov $0x1,%eax
1571: 48 8d 35 d7 a0 00 00 lea 0xa0d7(%rip),%rsi # b64f <_IO_stdin_used+0x64f>
1578: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
157c: e8 ff fa ff ff callq 1080 <__printf_chk@plt>
1581: 4c 8b 6d 08 mov 0x8(%rbp),%r13
1585: ba 06 00 00 00 mov $0x6,%edx
158a: 48 8d 35 70 9c 00 00 lea 0x9c70(%rip),%rsi # b201 <_IO_stdin_used+0x201>
1591: 4c 89 ef mov %r13,%rdi
1594: e8 97 fa ff ff callq 1030
1599: 85 c0 test %eax,%eax
159b: 75 33 jne 15d0
159d: 48 8d 35 48 92 00 00 lea 0x9248(%rip),%rsi # a7ec
15a4: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
15aa: 4c 89 f7 mov %r14,%rdi
15ad: e8 8e 94 00 00 callq aa40
15b2: bf 01 00 00 00 mov $0x1,%edi
15b7: b8 01 00 00 00 mov $0x1,%eax
15bc: 48 8d 35 3d aa 00 00 lea 0xaa3d(%rip),%rsi # c000 <_IO_stdin_used+0x1000>
15c3: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
15c7: e8 b4 fa ff ff callq 1080 <__printf_chk@plt>
15cc: 4c 8b 6d 08 mov 0x8(%rbp),%r13
15d0: ba 06 00 00 00 mov $0x6,%edx
15d5: 48 8d 35 8b a1 00 00 lea 0xa18b(%rip),%rsi # b767 <_IO_stdin_used+0x767>
15dc: 4c 89 ef mov %r13,%rdi
15df: e8 4c fa ff ff callq 1030
15e4: 85 c0 test %eax,%eax
15e6: 0f 85 f5 14 00 00 jne 2ae1
15ec: 48 8d 35 b3 56 00 00 lea 0x56b3(%rip),%rsi # 6ca6
15f3: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
15f9: 4c 89 f7 mov %r14,%rdi
15fc: e8 3f 94 00 00 callq aa40