Repository: clamchowder/Microbenchmarks
Branch: master
Commit: 13159d44086d
Files: 315
Total size: 3.0 MB
Directory structure:
gitextract_nqotrkr3/
├── .github/
│ └── workflows/
│ └── linux.yaml
├── .gitignore
├── AsmGen/
│ ├── AsmGen.csproj
│ ├── AsmGen.sln
│ ├── DataFiles/
│ │ ├── BranchhistTestBlock.c
│ │ ├── CommonFunctions.c
│ │ ├── GccBranchHistFunction.c
│ │ ├── GccIndirectBranchFunction.c
│ │ ├── IndirectBranchTestBlock.c
│ │ └── clammicrobench.vcxproj_template
│ ├── IUarchTest.cs
│ ├── Program.cs
│ ├── Properties/
│ │ └── launchSettings.json
│ ├── README.md
│ ├── UarchTest.cs
│ ├── UarchTestHelpers.cs
│ └── tests/
│ ├── A73RobTest.cs
│ ├── AddLoopTest.cs
│ ├── AddNsq.cs
│ ├── AddSchedTest.cs
│ ├── AddvNsq.cs
│ ├── AddvSchedTest.cs
│ ├── AeseSchedTest.cs
│ ├── AesencNsq.cs
│ ├── BranchBufferTest.cs
│ ├── BranchHistoryTest.cs
│ ├── BtbTest.cs
│ ├── CvtSchedTest.cs
│ ├── FAdd256RfTest.cs
│ ├── Fadd128RfTest.cs
│ ├── Fadd128SchedTest.cs
│ ├── Fadd256SchedTest.cs
│ ├── FaddNsq.cs
│ ├── FaddSchedTest.cs
│ ├── FcmpSchedTest.cs
│ ├── FlagRfTest.cs
│ ├── Fma256SchedTest.cs
│ ├── FmovSched.cs
│ ├── FmulSchedTest.cs
│ ├── FpRfTest.cs
│ ├── FpStoreDataNsq.cs
│ ├── IdrfTest.cs
│ ├── IndirectBranchTest.cs
│ ├── IntRfDepStoreTest.cs
│ ├── IntRfTest.cs
│ ├── JsCvtNsq.cs
│ ├── JsCvtSched.cs
│ ├── JumpNsqTest.cs
│ ├── JumpSchedTest.cs
│ ├── LdqTest.cs
│ ├── LeaSchedTest.cs
│ ├── LoadNsq.cs
│ ├── LoadSchedTest.cs
│ ├── MaddSchedTest.cs
│ ├── MaskRfTest.cs
│ ├── MixAddJumpSched.cs
│ ├── MixAddvJsCvtNsq.cs
│ ├── MixAddvJsCvtSched.cs
│ ├── MixBranchStoreTest.cs
│ ├── MixFAdd256and32RfTest.cs
│ ├── MixFpRfDepBranchTest.cs
│ ├── MixFpVecRfTest.cs
│ ├── MixIntRfDepBranchTest.cs
│ ├── MixIntVec128RfTest.cs
│ ├── MixIntrfFprfTest.cs
│ ├── MixJumpStoreDataSched.cs
│ ├── MixJumpStoreSchedTest.cs
│ ├── MixJumpThenAddSched.cs
│ ├── MixLdqStqTest.cs
│ ├── MixLoadStoreDivSchedTest.cs
│ ├── MixLoadStoreSchedTest.cs
│ ├── MixStoreDivSchedTest.cs
│ ├── MixVec512Vec256BlockRfTest.cs
│ ├── MixVec512Vec256RfTest.cs
│ ├── MmxRfTest.cs
│ ├── MulSchedTest.cs
│ ├── NopLoopTest.cs
│ ├── PdepSchedTest.cs
│ ├── ReturnStackTest.cs
│ ├── RobTest.cs
│ ├── RorSchedTest.cs
│ ├── ShlSchedTest.cs
│ ├── StoreDataDivNsqTest.cs
│ ├── StoreDataNsqTest.cs
│ ├── StoreDataSchedTest.cs
│ ├── StoreDivNsqTest.cs
│ ├── StoreDivSchedTest.cs
│ ├── StoreNsq.cs
│ ├── StoreSchedTest.cs
│ ├── Stq128Test.cs
│ ├── Stq512Test.cs
│ ├── StqTest.cs
│ ├── TakenBranchBufferTest.cs
│ ├── TakenJumpSchedTest.cs
│ ├── Vec512RfTest.cs
│ ├── VecMulNsq.cs
│ └── ZeroRobTest.cs
├── CoherencyLatency/
│ ├── CoherencyLatency.cpp
│ ├── CoherencyLatency.sln
│ ├── CoherencyLatency.vcxproj
│ ├── Makefile
│ ├── PThreadsCoherencyLatency.c
│ └── c2cparse/
│ ├── Program.cs
│ ├── c2cparse.csproj
│ └── c2cparse.sln
├── Common/
│ ├── arch_detect.mk
│ ├── ci_gpumemlatency.sh
│ ├── ci_package.sh
│ ├── perfmon.h
│ ├── timing.c
│ └── timing.h
├── CoreClockChecker/
│ ├── BoostClockChecker.c
│ ├── BoostClockChecker_arm.s
│ ├── BoostClockChecker_x86.s
│ ├── CoreClockChecker.c
│ ├── CoreClockChecker_x86.s
│ ├── Makefile
│ └── WinCoreClockChecker/
│ ├── CoreClockCheckFunctions.asm
│ ├── WinCoreClockChecker.cpp
│ ├── WinCoreClockChecker.sln
│ ├── WinCoreClockChecker.vcxproj
│ └── WinCoreClockChecker.vcxproj.filters
├── GpuMemLatency/
│ ├── Makefile
│ ├── OpenCL/
│ │ ├── LICENSE
│ │ ├── README.md
│ │ ├── include/
│ │ │ └── CL/
│ │ │ ├── cl.h
│ │ │ ├── cl_d3d10.h
│ │ │ ├── cl_d3d11.h
│ │ │ ├── cl_dx9_media_sharing.h
│ │ │ ├── cl_dx9_media_sharing_intel.h
│ │ │ ├── cl_egl.h
│ │ │ ├── cl_ext.h
│ │ │ ├── cl_ext_intel.h
│ │ │ ├── cl_gl.h
│ │ │ ├── cl_gl_ext.h
│ │ │ ├── cl_half.h
│ │ │ ├── cl_icd.h
│ │ │ ├── cl_platform.h
│ │ │ ├── cl_va_api_media_sharing_intel.h
│ │ │ ├── cl_version.h
│ │ │ └── opencl.h
│ │ └── lib/
│ │ └── OpenCL.lib
│ ├── atomic_test.c
│ ├── bw_test.c
│ ├── common.c
│ ├── instruction_rate.c
│ ├── instruction_rate_fp16_kernel.cl
│ ├── instruction_rate_fp64_kernel.cl
│ ├── instruction_rate_kernel.cl
│ ├── kernel.cl
│ ├── kernels/
│ │ ├── atomic_exec_latency_test.cl
│ │ ├── buffer_bw_test.cl
│ │ ├── c2c_atomic_exec_latency_test.cl
│ │ ├── constant_unrolled_latency_test.cl
│ │ ├── ldst_bw_test.cl
│ │ ├── local_64_bw_test.cl
│ │ ├── local_atomic_latency_test.cl
│ │ ├── local_bw_test.cl
│ │ ├── local_float4_bw_test.cl
│ │ ├── local_unrolled_latency_test.cl
│ │ ├── scalar_unrolled_latency_test.cl
│ │ ├── sum_bw_test.cl
│ │ ├── tex_bw_test.cl
│ │ ├── tex_latency_test.cl
│ │ └── unrolled_latency_test.cl
│ ├── latency_test.c
│ ├── local_mem_latency_kernel.cl
│ ├── opencltest.c
│ ├── opencltest.h
│ ├── opencltest.sln
│ ├── opencltest.vcxproj
│ ├── opencltest.vcxproj.filters
│ └── texturetest.c
├── InstructionRate/
│ ├── Makefile
│ ├── arm_instructionrate.c
│ ├── arm_instructionrate.s
│ ├── riscv_instructionrate.c
│ ├── riscv_instructionrate.s
│ ├── test.s
│ ├── x86_fusion.c
│ ├── x86_fusion.s
│ ├── x86_instructionrate.c
│ └── x86_instructionrate.s
├── LICENSE
├── LoadedMemoryLatency/
│ ├── LoadedMemoryLatency/
│ │ ├── LoadedMemoryLatency.asm
│ │ ├── LoadedMemoryLatency.cpp
│ │ ├── LoadedMemoryLatency.sln
│ │ ├── LoadedMemoryLatency.vcxproj
│ │ └── LoadedMemoryLatency.vcxproj.filters
│ ├── LoadedMemoryLatency.c
│ ├── LoadedMemoryLatency_amd64.s
│ ├── LoadedMemoryLatency_arm.s
│ └── Makefile
├── Makefile
├── MemoryBandwidth/
│ ├── Makefile
│ ├── MemoryBandwidth/
│ │ ├── MemoryBandwidth.cpp
│ │ ├── MemoryBandwidth.sln
│ │ ├── MemoryBandwidth.vcxproj
│ │ ├── MemoryBandwidth.vcxproj.filters
│ │ ├── MemoryBandwidthFunctions.asm
│ │ └── MemoryBandwidthFunctions32.asm
│ ├── MemoryBandwidth.c
│ ├── MemoryBandwidth_arm.s
│ ├── MemoryBandwidth_riscv.s
│ ├── MemoryBandwidth_x86.s
│ ├── MixedMemoryBandwidthTest/
│ │ ├── MemoryBandwidth.h
│ │ ├── MemoryBandwidthFunctions.asm
│ │ ├── MixedMemoryBandwidthTest.cpp
│ │ ├── MixedMemoryBandwidthTest.vcxproj
│ │ └── MixedMemoryBandwidthTest.vcxproj.filters
│ └── README.md
├── MemoryLatency/
│ ├── Makefile
│ ├── MemoryLatency.c
│ ├── MemoryLatency.cpp
│ ├── MemoryLatency.sln
│ ├── MemoryLatency.vcxproj
│ ├── MemoryLatencyFunctions.asm
│ ├── MemoryLatency_arm.s
│ ├── MemoryLatency_i686.s
│ ├── MemoryLatency_riscv.s
│ ├── MemoryLatency_x86.s
│ └── README.md
├── README.md
├── mt_instructionrate/
│ ├── InstructionRateFunctions.asm
│ ├── Makefile
│ ├── Project1.vcxproj
│ ├── Project1.vcxproj.filters
│ ├── arm_mt_instructionrate.c
│ ├── arm_mt_instructionrate.s
│ ├── mt_instructionrate.c
│ ├── mt_instructionrate.sln
│ ├── ppc64_mt_instructionrate.c
│ ├── ppc64_mt_instructionrate.s
│ ├── x86_mt_instructionrate
│ ├── x86_mt_instructionrate.c
│ └── x86_mt_instructionrate.s
└── svm/
├── OpenCL/
│ ├── include/
│ │ └── CL/
│ │ ├── Utils/
│ │ │ ├── Context.h
│ │ │ ├── Context.hpp
│ │ │ ├── Detail.hpp
│ │ │ ├── Device.hpp
│ │ │ ├── Error.h
│ │ │ ├── Error.hpp
│ │ │ ├── ErrorCodes.h
│ │ │ ├── Event.h
│ │ │ ├── Event.hpp
│ │ │ ├── File.h
│ │ │ ├── File.hpp
│ │ │ ├── InteropContext.hpp
│ │ │ ├── OpenCLUtilsCpp_Export.h
│ │ │ ├── OpenCLUtils_Export.h
│ │ │ ├── Platform.hpp
│ │ │ ├── Utils.h
│ │ │ └── Utils.hpp
│ │ ├── cl.h
│ │ ├── cl2.hpp
│ │ ├── cl_d3d10.h
│ │ ├── cl_d3d11.h
│ │ ├── cl_dx9_media_sharing.h
│ │ ├── cl_dx9_media_sharing_intel.h
│ │ ├── cl_egl.h
│ │ ├── cl_ext.h
│ │ ├── cl_ext_intel.h
│ │ ├── cl_function_types.h
│ │ ├── cl_gl.h
│ │ ├── cl_gl_ext.h
│ │ ├── cl_half.h
│ │ ├── cl_icd.h
│ │ ├── cl_layer.h
│ │ ├── cl_platform.h
│ │ ├── cl_va_api_media_sharing_intel.h
│ │ ├── cl_version.h
│ │ ├── opencl.h
│ │ └── opencl.hpp
│ ├── lib/
│ │ ├── OpenCL.lib
│ │ ├── OpenCLExt.lib
│ │ ├── OpenCLUtils.lib
│ │ ├── OpenCLUtilsCpp.lib
│ │ ├── OpenCLUtilsCppd.lib
│ │ ├── OpenCLUtilsd.lib
│ │ └── pkgconfig/
│ │ └── OpenCL.pc
│ └── share/
│ ├── cmake/
│ │ ├── OpenCL/
│ │ │ ├── OpenCLConfig.cmake
│ │ │ └── OpenCLConfigVersion.cmake
│ │ ├── OpenCLExtensionLoader/
│ │ │ ├── OpenCLExtensionLoaderConfig.cmake
│ │ │ ├── OpenCLExtensionLoaderConfigVersion.cmake
│ │ │ ├── OpenCLExtensionLoaderTargets-debug.cmake
│ │ │ ├── OpenCLExtensionLoaderTargets-release.cmake
│ │ │ └── OpenCLExtensionLoaderTargets.cmake
│ │ ├── OpenCLHeaders/
│ │ │ ├── OpenCLHeadersConfig.cmake
│ │ │ ├── OpenCLHeadersConfigVersion.cmake
│ │ │ └── OpenCLHeadersTargets.cmake
│ │ ├── OpenCLHeadersCpp/
│ │ │ ├── OpenCLHeadersCppConfig.cmake
│ │ │ ├── OpenCLHeadersCppConfigVersion.cmake
│ │ │ └── OpenCLHeadersCppTargets.cmake
│ │ ├── OpenCLICDLoader/
│ │ │ ├── OpenCLICDLoaderConfig.cmake
│ │ │ ├── OpenCLICDLoaderConfigVersion.cmake
│ │ │ ├── OpenCLICDLoaderTargets-debug.cmake
│ │ │ ├── OpenCLICDLoaderTargets-release.cmake
│ │ │ └── OpenCLICDLoaderTargets.cmake
│ │ ├── OpenCLUtils/
│ │ │ ├── OpenCLUtilsConfig.cmake
│ │ │ ├── OpenCLUtilsConfigVersion.cmake
│ │ │ ├── OpenCLUtilsTargets-debug.cmake
│ │ │ ├── OpenCLUtilsTargets-release.cmake
│ │ │ └── OpenCLUtilsTargets.cmake
│ │ └── OpenCLUtilsCpp/
│ │ ├── OpenCLUtilsCppConfig.cmake
│ │ ├── OpenCLUtilsCppConfigVersion.cmake
│ │ ├── OpenCLUtilsCppTargets-debug.cmake
│ │ ├── OpenCLUtilsCppTargets-release.cmake
│ │ └── OpenCLUtilsCppTargets.cmake
│ └── pkgconfig/
│ ├── OpenCL-CLHPP.pc
│ └── OpenCL-Headers.pc
├── atomic_latency_kernel.cl
├── svm.sln
├── svm.vcxproj
├── svm.vcxproj.filters
└── svmtest.cpp
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/linux.yaml
================================================
name: Build Benchmarks on Ubuntu
on: [push]
jobs:
  BuildBenchmarks:
    # Only Ubuntu for now.
    runs-on: ubuntu-latest
    steps:
      - name: Install prerequisites
        run: sudo apt update && sudo apt -qq --assume-yes full-upgrade && sudo apt install -qq -y build-essential crossbuild-essential-arm64 gcc-riscv64-linux-gnu ocl-icd-opencl-dev opencl-headers libnuma-dev b3sum unzip
      - name: Wild tomfoolery attempt
        run: eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)" && brew install mingw-w64
      - name: Check out repository code
        uses: actions/checkout@v3
      - name: Build all benchmarks
        run: eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)" && make ci
      - name: Package benchmarks
        run: make package
      - name: b3sum
        run: b3sum clammarks.txz
      # - name: Upload package
      #   env:
      #     UPLOAD_KEY: ${{ secrets.UPLOAD_KEY }}
      #     UPLOAD_URL: ${{ secrets.UPLOAD_URL }}
      #   run: curl -X PUT -T clammarks.txz -H "$UPLOAD_KEY" "$UPLOAD_URL"
================================================
FILE: .gitignore
================================================
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates
*.swp
*generatednasm*
*.exe
MemoryBandwidth/membw_*
MemoryLatency/MemoryLatency
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Mono auto generated files
mono_crash.*
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/
clammicrobench/*.asm
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# ASP.NET Scaffolding
ScaffoldingReadMe.txt
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.log
*.tlog
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Nuget personal access tokens and Credentials
nuget.config
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# CodeRush personal settings
.cr/personal
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
# BeatPulse healthcheck temp database
healthchecksdb
# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/
# Ionide (cross platform F# VS Code tools) working folder
.ionide/
# Fody - auto-generated XML schema
FodyWeavers.xsd
# VS Code files for those working on multiple tools
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
*.code-workspace
# Local History for Visual Studio Code
.history/
# Windows Installer files from build outputs
*.cab
*.msi
*.msix
*.msm
*.msp
# JetBrains Rider
.idea/
*.sln.iml
================================================
FILE: AsmGen/AsmGen.csproj
================================================
Exe
net8.0
false
x64
AnyCPU;x64
Always
Always
Always
Always
Always
Always
================================================
FILE: AsmGen/AsmGen.sln
================================================
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.2.32516.85
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AsmGen", "AsmGen.csproj", "{B8930E86-946C-4831-B088-F571E73EEDC4}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Debug|x64 = Debug|x64
Release|Any CPU = Release|Any CPU
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|Any CPU.Build.0 = Debug|Any CPU
{B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|x64.ActiveCfg = Debug|x64
{B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|x64.Build.0 = Debug|x64
{B8930E86-946C-4831-B088-F571E73EEDC4}.Release|Any CPU.ActiveCfg = Release|Any CPU
{B8930E86-946C-4831-B088-F571E73EEDC4}.Release|Any CPU.Build.0 = Release|Any CPU
{B8930E86-946C-4831-B088-F571E73EEDC4}.Release|x64.ActiveCfg = Release|x64
{B8930E86-946C-4831-B088-F571E73EEDC4}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {4433D029-CD62-44B9-862E-A8DE52DA45CE}
EndGlobalSection
EndGlobal
================================================
FILE: AsmGen/DataFiles/BranchhistTestBlock.c
================================================
// Partial fragment appended into the generated benchmark's main() by AsmGen.
// Generated code preceding this block provides branchHistoryLengths,
// branchCounts, maxBranchCount, and initializeBranchHistFuncArr.
uint32_t testSizeCount = sizeof(branchHistoryLengths) / sizeof(int);
initializeBranchHistFuncArr();
srand(time(NULL));
// result matrices: rows = branch counts, columns = history lengths
size_t resultSize = sizeof(float) * maxBranchCount * testSizeCount;
float* randomResults = (float*)malloc(resultSize);
float* predictableResults = (float*)malloc(resultSize);
for (uint32_t branchCountIdx = 0; branchCountIdx < maxBranchCount; branchCountIdx++) {
for (uint32_t testSizeIdx = 0; testSizeIdx < testSizeCount; testSizeIdx++) {
uint32_t testSize = branchHistoryLengths[testSizeIdx];
uint32_t branchCount = branchCounts[branchCountIdx];
printf("Testing branch count %d history length %d\n", branchCount, testSize);
// third arg per runBranchHistTest: 1 = random pattern, 0 = all zeroes (predictable)
randomResults[branchCountIdx * testSizeCount + testSizeIdx] = runBranchHistTest(testSize, branchCountIdx, 1);
predictableResults[branchCountIdx * testSizeCount + testSizeIdx] = runBranchHistTest(testSize, branchCountIdx, 0);
printf("%d, %f, %f\n", testSize,
randomResults[branchCountIdx * testSizeCount + testSizeIdx],
predictableResults[branchCountIdx * testSizeCount + testSizeIdx]);
}
}
// dump both matrices in CSV form for pasting into a spreadsheet
printf("Random:\n");
printResultFloatArr(randomResults, branchHistoryLengths, testSizeCount, branchCounts, maxBranchCount);
printf("\nPredictable:\n");
printResultFloatArr(predictableResults, branchHistoryLengths, testSizeCount, branchCounts, maxBranchCount);
free(randomResults);
free(predictableResults);
================================================
FILE: AsmGen/DataFiles/CommonFunctions.c
================================================
// this is a partial C file that's appended into generated code
// stuff here is generic enough to work for both windows/vs and gcc
#ifndef __MINGW32__
// optional affinity setting for effed up qualcomm/android bs
// NOTE(review): the header names after #include were lost in extraction; the
// set below is reconstructed (cpu_set_t/CPU_ZERO/CPU_SET need sched.h with
// _GNU_SOURCE, printf needs stdio.h, gettid needs unistd.h + sys/syscall.h)
// — TODO confirm against the original file.
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
// Request pinning the calling thread to `core`. The sched_setaffinity call is
// commented out in the original, so currently this only logs the request.
void setAffinity(int core) {
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(core, &cpuset);
printf("Set affinity to core %d\n", core);
// sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
}
#endif
// Per-thread parameters handed to generated benchmark thread entry points.
// Field roles are inferred from names — exact usage is defined by the
// generated asm/C code (NOTE(review): confirm against generator output).
struct ThreadData {
int* A; // integer array used by the test (presumably pointer-chasing/input data)
int* B; // second integer array
float* fpArr; // floating point data for FP tests
uint32_t list_size; // element count of the arrays above
uint64_t structIterations; // iteration count for the test loop
};
// Emit the CSV header row: a leading "x" cell followed by one column label
// per entry of xCounts, then a newline.
void printCsvHeader(uint32_t* xCounts, uint32_t xLen) {
    printf("x");
    uint32_t col = 0;
    while (col < xLen) {
        printf(", %d", xCounts[col]);
        col++;
    }
    printf("\n");
}
// Print a yLen-by-xLen matrix of float results as CSV that a spreadsheet can
// ingest: header row from printCsvHeader, then one row per yCounts entry.
void printResultFloatArr(float* arr, uint32_t *xCounts, uint32_t xLen, uint32_t *yCounts, uint32_t yLen) {
    printCsvHeader(xCounts, xLen);
    for (uint32_t row = 0; row < yLen; row++) {
        // row header cell
        printf("%d", yCounts[row]);
        float *rowData = arr + row * xLen;
        for (uint32_t col = 0; col < xLen; col++) {
            printf(",%f", rowData[col]);
        }
        printf("\n");
    }
}
// Build a pointer-chasing pattern in pattern_arr: elements spaced
// byte_increment bytes apart are first set to their own element offsets, then
// shuffled so chasing arr[idx] visits the strided elements in random order.
// list_size is in uint32_t elements; byte_increment is the stride in bytes.
void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) {
uint32_t increment = byte_increment / sizeof(uint32_t);
uint32_t element_count = list_size / increment;
// identity mapping at stride `increment`
for (int i = 0; i < element_count; i++) {
pattern_arr[i * increment] = i * increment;
}
// Shuffle (Fisher-Yates shape). Note j is drawn from [0, iter-2], so the
// element at `iter` is never swapped with itself; that keeps every element
// off its own slot as in Sattolo's algorithm, at the cost of a slightly
// biased permutation. NOTE(review): presumably intentional so the pointer
// chase forms one long cycle instead of short-circuiting — confirm.
int iter = element_count;
while (iter > 1) {
iter -= 1;
int j = iter - 1 == 0 ? 0 : rand() % (iter - 1);
uint32_t tmp = pattern_arr[iter * increment];
pattern_arr[iter * increment] = pattern_arr[j * increment];
pattern_arr[j * increment] = tmp;
}
}
================================================
FILE: AsmGen/DataFiles/GccBranchHistFunction.c
================================================
// this is a partial C file that's appended into generated code
// Run a test, return the result in time (ns) per branch.
// historyLen: length of the random pattern array each branch loops through
// branchCountIdx: index into branchCounts[], selects the generated test function
// random: if 1, randomize test array contents. If 0, fill with zeroes
// Requires generated symbols: branchCounts, branchtestFuncArr.
float runBranchHistTest(uint32_t historyLen, uint32_t branchCountIdx, int random) {
struct timeval startTv, endTv;
struct timezone startTz, endTz;
uint32_t branchCount = branchCounts[branchCountIdx];
// scale iterations down as branch count rises to keep total work roughly constant
uint64_t iterations = 320000000 / branchCount;
uint64_t(*branchtestFunc)(uint64_t, uint32_t**, uint32_t) __attribute((sysv_abi)) = branchtestFuncArr[branchCountIdx];
float onesCount = 0.0f;
// one taken/not-taken pattern array per branch
uint32_t** testArrToArr = (uint32_t**)malloc(sizeof(uint32_t*) * branchCount);
for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) {
uint32_t* testArr = (uint32_t*)malloc(sizeof(uint32_t) * historyLen);
for (uint32_t i = 0; i < historyLen; i++) {
testArr[i] = random ? rand() % 2 : 0;
if (testArr[i] > 0)
{
onesCount += 1.0f;
}
}
testArrToArr[testArrIdx] = testArr;
}
// bug fix: scale to percent; previously printed a 0..1 fraction while the
// message (and the matching fprintf below) says "percent"
fprintf(stderr, "Starting test, should have %0.2f percent ones\n", 100.0f * onesCount / ((float)historyLen * branchCount));
gettimeofday(&startTv, &startTz);
uint64_t takenBranchCount = branchtestFunc(iterations, testArrToArr, historyLen);
gettimeofday(&endTv, &endTz);
// millisecond resolution is adequate for these iteration counts
uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
float latency = 1e6 * (float)time_diff_ms / (float)iterations;
// give result in latency per branch
latency = latency / branchCount;
fprintf(stderr, "History length %u, branch count %u: %0.2f percent not-taken\n", historyLen, branchCount, 100 * (float)takenBranchCount / ((float)iterations * branchCount));
for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) free(testArrToArr[testArrIdx]);
free(testArrToArr);
return latency;
}
================================================
FILE: AsmGen/DataFiles/GccIndirectBranchFunction.c
================================================
// similar but for indirect branch test; returns time (ns) per branch.
// needs generated symbols: indirectBranchTestFuncArr, indirectBranchCounts,
// indirectBranchTargetCounts
// mode:
// 0 - cycle through targets
// 1 - random target selection
// 2 - jump to middle
float runIndirectBranchTest(uint32_t branchCountIdx, uint32_t targetCountIdx, uint32_t mode) {
struct timeval startTv, endTv;
struct timezone startTz, endTz;
uint32_t branchCount = indirectBranchCounts[branchCountIdx];
uint32_t targetCount = indirectBranchTargetCounts[targetCountIdx];
// scale iterations down as branch count rises to keep total runtime bounded
uint64_t iterations = 80000000 / branchCount;
uint64_t(*branchtestFunc)(uint64_t, uint32_t**, uint32_t, uint64_t **) __attribute((sysv_abi)) = indirectBranchTestFuncArr[branchCountIdx][targetCountIdx];
// generate an array containing jump target indexes for every branch
uint32_t** testArrToArr = (uint32_t**)malloc(sizeof(uint32_t*) * branchCount);
for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) {
uint32_t* testArr = (uint32_t*)malloc(sizeof(uint32_t) * targetCount);
if (mode == 1)
for (uint32_t i = 0; i < targetCount; i++) testArr[i] = rand() % targetCount;
else if (mode == 0)
for (uint32_t i = 0; i < targetCount; i++) testArr[i] = i;
else if (mode == 2)
for (uint32_t i = 0; i < targetCount; i++) testArr[i] = targetCount / 2;
testArrToArr[testArrIdx] = testArr;
}
// each branch needs a jump table; allocated uninitialized here — presumably
// populated by the generated test function before use (TODO confirm)
uint64_t** jumpTables = (uint64_t**)malloc(sizeof(uint64_t*) * branchCount);
for (int jumpTableIdx = 0; jumpTableIdx < branchCount; jumpTableIdx++)
{
uint64_t* jumpTable = (uint64_t*)malloc(sizeof(uint64_t) * targetCount);
jumpTables[jumpTableIdx] = jumpTable;
}
gettimeofday(&startTv, &startTz);
// uint64_t iterations, uint32_t **arr, uint32_t arrLen, uint64_t **scratch
branchtestFunc(iterations, testArrToArr, targetCount, jumpTables);
gettimeofday(&endTv, &endTz);
// millisecond resolution is adequate for these iteration counts
uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
float latency = 1e6 * (float)time_diff_ms / (float)iterations;
// give result in latency per branch
latency = latency / branchCount;
for (int testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) free(testArrToArr[testArrIdx]);
free(testArrToArr);
for (int jumpTableIdx = 0; jumpTableIdx < branchCount; jumpTableIdx++) free(jumpTables[jumpTableIdx]);
free(jumpTables);
return latency;
}
================================================
FILE: AsmGen/DataFiles/IndirectBranchTestBlock.c
================================================
// Partial fragment appended into the generated benchmark's main() by AsmGen.
// Generated code preceding this block provides:
// - indirectBranchTargetCounts = array containing # of targets per branch
// - indirectBranchCounts = array containing # of branches to test
// - maxIndirectBranchCount = length of ^^
// - initializeIndirectBranchFuncArr = populates the per-test function table
uint32_t testSizeCount = sizeof(indirectBranchTargetCounts) / sizeof(int);
initializeIndirectBranchFuncArr();
srand(time(NULL));
// result matrices: rows = branch counts, columns = target counts
size_t resultSize = sizeof(float) * maxIndirectBranchCount * testSizeCount;
float* results = (float*)malloc(resultSize);
float* refResults = (float*)malloc(resultSize);
for (uint32_t branchCountIdx = 0; branchCountIdx < maxIndirectBranchCount; branchCountIdx++) {
for (uint32_t targetCountIdx = 0; targetCountIdx < testSizeCount; targetCountIdx++) {
uint32_t testSize = indirectBranchTargetCounts[targetCountIdx];
uint32_t branchCount = indirectBranchCounts[branchCountIdx];
printf("Testing branch count %d target count %d:", branchCount, testSize);
// mode 0 = cycle through targets; mode 2 = always jump to middle (reference)
results[branchCountIdx * testSizeCount + targetCountIdx] = runIndirectBranchTest(branchCountIdx, targetCountIdx, 0);
refResults[branchCountIdx * testSizeCount + targetCountIdx] = runIndirectBranchTest(branchCountIdx, targetCountIdx, 2);
printf("%f ns, reference %f ns\n",
results[branchCountIdx * testSizeCount + targetCountIdx],
refResults[branchCountIdx * testSizeCount + targetCountIdx]);
}
}
// dump both matrices in CSV form for pasting into a spreadsheet
printf("Indirect branch results:\n");
printResultFloatArr(results, indirectBranchTargetCounts, testSizeCount, indirectBranchCounts, maxIndirectBranchCount);
printf("Reference indirect branch results:\n");
printResultFloatArr(refResults, indirectBranchTargetCounts, testSizeCount, indirectBranchCounts, maxIndirectBranchCount);
free(results);
free(refResults);
================================================
FILE: AsmGen/DataFiles/clammicrobench.vcxproj_template
================================================
Debug
Win32
Release
Win32
Debug
x64
Release
x64
16.0
Win32Proj
{7e8cf2ba-57a7-4b42-b721-97e02bf9a8b8}
clammicrobench
10.0
Application
true
v142
Unicode
Application
false
v142
true
Unicode
Application
true
v142
Unicode
Application
false
v142
true
Unicode
true
false
true
false
Level3
true
WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
Level3
true
_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
%REPLACEWITHCUSTOMBUILD%
================================================
FILE: AsmGen/IUarchTest.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// Contract for a microarchitecture test that AsmGen can emit code for:
/// per-ISA assembly plus the C driver block that calls it.
/// </summary>
public interface IUarchTest
{
/// Name prefix for generated thread-launch wrapper functions.
public const string ThreadLaunchFunctionPrefix = "ThreadLaunch_";
// enough to generate global lines, function calls, and let user pick from tests
/// <summary>Unique short name used to prefix generated symbols and select the test.</summary>
public string Prefix { get; }
/// <summary>Human-readable description of the test.</summary>
public string Description { get; }
/// <summary>NOTE(review): presumably tells the harness to divide measured time by the tested count — confirm in the generator.</summary>
public bool DivideTimeByCount { get; }
/// <summary>Whether this test can generate code for the given ISA.</summary>
public bool SupportsIsa(ISA isa);
/// <summary>Append this test's assembly implementation for the given ISA.</summary>
public void GenerateAsm(StringBuilder sb, ISA isa);
/// <summary>Append the C code that runs this test into the generated driver.</summary>
public void GenerateTestBlock(StringBuilder sb, ISA isa);
/// <summary>Append global symbol directives for this test's asm functions.</summary>
public void GenerateAsmGlobalLines(StringBuilder sb);
/// <summary>Append C extern declarations for this test's asm functions.</summary>
public void GenerateExternLines(StringBuilder sb);
/// <summary>Instruction set architectures AsmGen can target.</summary>
public enum ISA
{
amd64, // 64-bit x86
aarch64, // 64-bit arm
mips64, // 64-bit MIPS, for loongson
riscv, // 64-bit risc-v
}
}
}
================================================
FILE: AsmGen/Program.cs
================================================
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Threading.Tasks;
namespace AsmGen
{
class Program
{
public static string DataFilesDir = "DataFiles";
static int structTestIterations = 5000000;
static int iterations = 100 * structTestIterations;
static int latencyListSize = 131072 * 1024 / 4; // 128 MB
// Entry point: registers the microarchitecture tests to generate, then emits
// the per-ISA C driver and asm files in parallel, followed by a Makefile.
static void Main(string[] args)
{
    // NOTE(review): generic type arguments restored — the extracted text had
    // bare "List", which does not compile; element types follow from usage.
    List<IUarchTest> tests = new List<IUarchTest>();
    tests.Add(new BtbTest(4, BtbTest.BranchType.Unconditional));
    tests.Add(new BtbTest(8, BtbTest.BranchType.Unconditional));
    tests.Add(new BtbTest(16, BtbTest.BranchType.Unconditional));
    tests.Add(new BtbTest(32, BtbTest.BranchType.Unconditional));
    tests.Add(new BtbTest(64, BtbTest.BranchType.Unconditional));
    tests.Add(new BtbTest(4, BtbTest.BranchType.Conditional));
    tests.Add(new BtbTest(8, BtbTest.BranchType.Conditional));
    tests.Add(new BtbTest(16, BtbTest.BranchType.Conditional));
    tests.Add(new BtbTest(32, BtbTest.BranchType.Conditional));
    tests.Add(new BranchHistoryTest());

    // Each task writes its own output file, so they can run concurrently
    // without sharing mutable state.
    List<Task> tasks = new List<Task>();
    tasks.Add(Task.Run(() => GenerateCFile(tests, IUarchTest.ISA.amd64)));
    tasks.Add(Task.Run(() => GenerateCFile(tests, IUarchTest.ISA.aarch64)));
    tasks.Add(Task.Run(() => GenerateCFile(tests, IUarchTest.ISA.mips64)));
    tasks.Add(Task.Run(() => GenerateCFile(tests, IUarchTest.ISA.riscv)));
    tasks.Add(Task.Run(() => GenerateAsmFile(tests, IUarchTest.ISA.amd64)));
    tasks.Add(Task.Run(() => GenerateAsmFile(tests, IUarchTest.ISA.aarch64)));
    tasks.Add(Task.Run(() => GenerateAsmFile(tests, IUarchTest.ISA.mips64)));
    tasks.Add(Task.Run(() => GenerateAsmFile(tests, IUarchTest.ISA.riscv)));
    Task.WaitAll(tasks.ToArray());
    GenerateMakefile();
}
static void GenerateCFile(List tests, IUarchTest.ISA isa)
{
StringBuilder sb = new StringBuilder();
sb.AppendLine("#define _GNU_SOURCE");
sb.AppendLine("#include \n#include\n#include\n#include \n#include \n#include \n");
sb.AppendLine("#pragma GCC diagnostic ignored \"-Wattributes\"");
string commonFunctions = File.ReadAllText(Path.Combine(DataFilesDir, "CommonFunctions.c"));
sb.AppendLine(commonFunctions);
foreach (IUarchTest test in tests)
{
if (test.SupportsIsa(isa))
{
test.GenerateExternLines(sb);
Console.WriteLine("Test " + test.Prefix + " supports ISA " + isa);
}
}
// no indexed addressing mode on these architectures, so make sure we can do pointer
// chasing with a single instruction
if (isa == IUarchTest.ISA.mips64 || isa == IUarchTest.ISA.riscv)
{
sb.AppendLine("extern void preplatencyarr(int *arr, uint32_t list_size);");
}
AddCommonInitCode(sb, tests, isa);
foreach (IUarchTest test in tests)
{
if (test.SupportsIsa(isa)) test.GenerateTestBlock(sb, isa);
}
AddCommonEndCode(sb);
File.WriteAllText("clammicrobench_" + isa.ToString() + ".c", sb.ToString());
}
static void GenerateAsmFile(List tests, IUarchTest.ISA isa)
{
string filename = "clammicrobench_" + isa.ToString() + ".s";
StringBuilder sb = new StringBuilder();
sb.AppendLine(".text");
if (isa == IUarchTest.ISA.mips64)
{
UarchTest.GenerateMipsPrepArrayFunction(sb);
}
else if (isa == IUarchTest.ISA.riscv)
{
UarchTest.GenerateRiscvPrepArrayFunction(sb);
}
File.WriteAllText(filename, sb.ToString());
sb.Clear();
foreach (IUarchTest test in tests)
{
if (test.SupportsIsa(isa))
{
sb.Clear();
test.GenerateAsmGlobalLines(sb);
test.GenerateAsm(sb, isa);
File.AppendAllText(filename, sb.ToString());
}
}
}
static void GenerateMakefile()
{
StringBuilder sb = new StringBuilder();
foreach (IUarchTest.ISA isa in Enum.GetValues(typeof(IUarchTest.ISA)))
{
sb.AppendLine(isa.ToString() + ":");
if (isa == IUarchTest.ISA.aarch64)
{
sb.AppendLine($"\tgcc -march=armv8.5-a+aes clammicrobench_{isa.ToString()}.c clammicrobench_{isa.ToString()}.s -o cb -static");
// hack for stupid compilers that need a ton of flags to do basic things
sb.AppendLine("android:");
sb.AppendLine("\tclang -march=armv8.3-a -mfpu=neon-fp-armv8 clammicrobench_aarch64.c clammicrobench_aarch64.s -o cb");
}
else sb.AppendLine($"\tgcc -pthread clammicrobench_{isa.ToString()}.c clammicrobench_{isa.ToString()}.s -o cb");
}
sb.AppendLine("win64:");
sb.AppendLine($"\tx86_64-w64-mingw32-gcc clammicrobench_{IUarchTest.ISA.amd64.ToString()}.c clammicrobench_{IUarchTest.ISA.amd64.ToString()}.s -o cb.exe");
sb.AppendLine("clean:");
sb.AppendLine("\trm clammicrobench_* cb");
File.WriteAllText("Makefile", sb.ToString());
}
// Adds largely ISA independent initialization code that gives tests a basic foundation,
// like a pointer chasing array
static void AddCommonInitCode(StringBuilder sb, List tests, IUarchTest.ISA isa)
{
sb.AppendLine("int main(int argc, char *argv[]) {");
sb.AppendLine($" uint64_t time_diff_ms, iterations = {iterations}, structIterations = {structTestIterations}, tmp;");
sb.AppendLine(" double latency; int *A = NULL, *B = NULL; float *fpArr = NULL; char *test_name = NULL; int core_affinity = -1; int threads = 1;");
sb.AppendLine(" uint64_t tmpsink;");
sb.AppendLine(" uint32_t list_size = " + latencyListSize + ";");
// print a help message based on tests available
sb.AppendLine($" printf(\"Usage: -test [test name] -listsize [latency list size = {latencyListSize}] -iterations [struct iterations = {structTestIterations}]\\n\");");
sb.AppendLine(" if (argc < 2) {");
sb.AppendLine(" printf(\"List of tests:\\n\");");
foreach (IUarchTest test in tests)
{
if (test.SupportsIsa(isa)) sb.AppendLine($" printf(\" {test.Prefix} - {test.Description}\\n\");");
}
// args provided. parse them and run test
sb.AppendLine(" } else {");
// args handling
sb.AppendLine(" for (int argIdx = 1; argIdx < argc; argIdx++) {");
sb.AppendLine(" if (*(argv[argIdx]) == '-') { char *arg = argv[argIdx] + 1;");
sb.AppendLine(" if (strncmp(arg, \"test\", 4) == 0) { argIdx++; test_name = argv[argIdx]; }");
sb.AppendLine(" if (strncmp(arg, \"iterations\", 10) == 0) { argIdx++; iterations = 100 * atoi(argv[argIdx]); }");
sb.AppendLine(" if (strncmp(arg, \"listsize\", 8) == 0) { argIdx++; list_size = atoi(argv[argIdx]); }");
sb.AppendLine(" if (strncmp(arg, \"affinity\", 8) == 0) { argIdx++; core_affinity = atoi(argv[argIdx]); }");
sb.AppendLine(" if (strncmp(arg, \"threads\", 7) == 0) { argIdx++; threads = atoi(argv[argIdx]); }");
sb.AppendLine(" }"); // end -arg handling if
sb.AppendLine(" }"); // end args handling for loop
sb.AppendLine(" if (test_name == NULL) { fprintf(stderr, \"No test specified\\n\"); return 0; }");
// Optional affinity setting for certain troublesome platforms
// don't need a version that uses Windows affinity APIs because Windows platforms never have this issue
sb.AppendLine("#ifndef __MINGW32__");
sb.AppendLine(" if (core_affinity != -1) setAffinity(core_affinity);");
sb.AppendLine("#endif");
// Generate array for pointer chasing unless we're doing a BTB test
sb.AppendLine(" if (argc == 1 || argc > 1 && strncmp(test_name, \"btb\", 3) != 0) {");
GenerateLatencyTestArray(sb);
sb.AppendLine(" }"); // end of ptr chasing array generation
sb.AppendLine(" struct timeval startTv, endTv;");
sb.AppendLine(" struct timezone startTz, endTz;");
}
static void AddCommonEndCode(StringBuilder sb)
{
sb.AppendLine(" free(A); free(B); free(fpArr);");
sb.AppendLine(" }"); // end else
sb.AppendLine(" return 0; }");
}
static void GenerateLatencyTestArray(StringBuilder sb)
{
// Fill list to create random access pattern
sb.AppendLine(" A = (int*)malloc(sizeof(int) * list_size);");
sb.AppendLine(" srand(time(NULL));");
sb.AppendLine(" FillPatternArr(A, list_size, 64);\n");
sb.AppendLine("#ifdef _WIN32");
sb.AppendLine(" B = (int*)_aligned_malloc(sizeof(int) * list_size, 64);\n");
sb.AppendLine("#else");
sb.AppendLine(" posix_memalign((void **)&B, 64, sizeof(int) * list_size);\n");
sb.AppendLine("#endif");
sb.AppendLine(" for (int i = 0; i < list_size; i++) { B[i] = i; }\n");
sb.AppendLine("#ifdef _WIN32");
sb.AppendLine(" fpArr = (float*)_aligned_malloc(sizeof(float) * list_size, 64);\n");
sb.AppendLine("#else");
sb.AppendLine(" posix_memalign((void **)&fpArr, 64, sizeof(float) * list_size);");
sb.AppendLine("#endif");
sb.AppendLine(" for (int i = 0;i < list_size; i++) { fpArr[i] = i + .1; }\n");
}
}
}
================================================
FILE: AsmGen/Properties/launchSettings.json
================================================
{
"profiles": {
"AsmGen": {
"commandName": "Project",
"commandLineArgs": "autocopy"
}
}
}
================================================
FILE: AsmGen/README.md
================================================
# Microbenchmark Generator
C# project to generate C and assembly for CPU structure size benchmarks that use different code for each data point, making them
impractical to write by hand. For more details on methodology for out-of-order structure size measurement, see https://blog.stuffedcow.net/2013/05/measuring-rob-capacity/
First, go to Program.cs and set the expected sizes for the structures you want to measure. The constructor for each test generally has the same (low, high, step) format. For example, if you anticipate ROB capacity will be between 128 and 512 entries, you can do `tests.Add(new RobTest(128, 512, 1))`
# Building
Compile the project and run AsmGen.exe. That generates a C harness and assembly file per ISA (e.g. `clammicrobench_amd64.c` and `clammicrobench_amd64.s`) plus a Makefile with one target per ISA. Compilation for Linux:
`gcc clammicrobench_amd64.c clammicrobench_amd64.s -o clammicrobench` for x86_64
`gcc clammicrobench_aarch64.c clammicrobench_aarch64.s -o clammicrobench` for aarch64
`aarch64-linux-gnu-gcc clammicrobench_aarch64.c clammicrobench_aarch64.s -o clammicrobench` to cross compile for aarch64 (for example from a fast desktop)
For Windows, run `AsmGen.exe autocopy`. That copies generated files to the /clammicrobench directory, assuming it's run from the default VS output location. Then, open /clammicrobench/clammicrobench.sln and build. You need nasm in your path for that, as covered on README.md at repo root.
The indirect branch test can take a while to build with nasm, so you might want to reduce the branch and target counts for that. Or just keep it commented out.
# Running
Generally, the syntax is `clammicrobench -test [test name] -listsize [list size for latency test] -iterations [iteration count]`. The last two parameters are optional.
# Tests
Running the program without parameters will spit out a list of tests and brief descriptions. Most are structure size tests. Instructions that consume certain core resources are placed between two pointer chasing loads. Once the two cache misses can't overlap, the structure being tested is full. Some tests, especially those measuring scheduler capacity, will hit a mix of instructions to see whether capacity is shared across different categories of instructions.
Alongside structure size tests, AsmGen is a convenient place to put other microbenchmarks that involve generating tons of code. There are several branch predictor tests:
- btb16Unconditional, etc: Creates a chain of taken branches in a loop to measure taken branch latency. Useful for showing BTB size and speed. Different distances between branches are useful because branch predictors sometimes have trouble tracking branches that are too close together.
- btb16Conditional: Same as above but with always-taken conditional branches
- branchhist - Branch history test: Generates branches that are taken or not taken in some random pattern, then increases the length of that pattern and the number of branches. Each branch is given its own pattern. This test thus tries to see how long of a pattern the branch predictor can track before getting a lot of mispredicts.
- indirectbranch - Indirect branch prediction test: Varies the number of branch targets and branches to see how many total targets the indirect branch predictor can track
- returnstack - Tests return prediction with nested calls of varying depths. When the return stack overflows, you'll see an increase in time per call/return pair.
================================================
FILE: AsmGen/UarchTest.cs
================================================
using System.Runtime.Serialization;
using System.Text;
namespace AsmGen
{
// Base class for generated microbenchmark tests. Provides the common C-side glue:
// assembler .global directives, C extern declarations with pthread wrapper functions,
// and the timed test block emitted into main(). Subclasses supply the assembly bodies.
public abstract class UarchTest : IUarchTest
{
// Function name prefix; each generated function is Prefix + count
public string Prefix { get; set; }
// Human readable description printed by the generated harness
public string Description { get; set; }
// Structure sizes (data points) to generate functions for
public int[] Counts;
// C parameter list used in the extern declarations
public string FunctionDefinitionParameters { get; set; }
// Argument list used at the call sites in the generated harness
public string GetFunctionCallParameters { get; set; }
// If true, run 'iterations' total work split across the tested count
// (used e.g. for call stack depth style tests)
public bool DivideTimeByCount { get; set; }
public abstract bool SupportsIsa(IUarchTest.ISA isa);
public abstract void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa);
// Emit a ".global" directive per generated function so the harness can link to them
public void GenerateAsmGlobalLines(StringBuilder sb)
{
for (int i = 0; i < Counts.Length; i++)
sb.AppendLine(".global " + Prefix + Counts[i]);
}
// Emit extern declarations (forced SysV ABI so mingw builds match the asm) plus a
// pthread-compatible wrapper per function for the multithreaded test path
public void GenerateExternLines(StringBuilder sb)
{
for (int i = 0; i < Counts.Length; i++)
{
sb.AppendLine("extern uint64_t " + Prefix + Counts[i] + $"({FunctionDefinitionParameters}) __attribute((sysv_abi));");
// Function that can be launched in a pthread
sb.AppendLine($"void *{IUarchTest.ThreadLaunchFunctionPrefix}{Prefix}{Counts[i]}(void *pa)");
sb.AppendLine("{");
sb.AppendLine(" struct ThreadData *td = (struct ThreadData *)pa;");
sb.AppendLine(" int *A = td->A;");
sb.AppendLine(" int *B = td->B;");
sb.AppendLine(" float *fpArr = td->fpArr;");
sb.AppendLine(" uint32_t list_size = td->list_size;");
sb.AppendLine(" int structIterations = td->structIterations;");
sb.AppendLine(" " + Prefix + Counts[i] + $"({GetFunctionCallParameters});");
sb.AppendLine(" return NULL;");
sb.AppendLine("}");
}
}
// Emit the timed block for this test into the generated main(): for each count,
// time the generated function (single threaded, or via pthreads when -threads > 1)
// and print "count,time-per-iteration" in microseconds... er, scaled by 1e6/1000ms
public void GenerateTestBlock(StringBuilder sb, IUarchTest.ISA isa)
{
sb.AppendLine(" if (argc > 1 && strcmp(test_name, \"" + Prefix + "\") == 0) {");
sb.AppendLine(" printf(\"" + Description + ":\\n\");");
// mips64/riscv lack indexed addressing, so convert the index array into a
// pointer-style array first (see preplatencyarr below)
if (isa == IUarchTest.ISA.mips64 || isa == IUarchTest.ISA.riscv)
{
sb.AppendLine(" if (argc == 1 || argc > 1 && strncmp(test_name, \"btb\", 3) != 0) {");
sb.AppendLine("preplatencyarr(A, list_size);");
sb.AppendLine(" }");
}
for (int i = 0; i < Counts.Length; i++)
{
// use more iterations (iterations = structIterations * 100) and divide iteration count by tested-thing count
// for certain tests like call stack depth
if (DivideTimeByCount)
{
sb.AppendLine(" tmp = structIterations;");
sb.AppendLine(" structIterations = iterations / " + Counts[i] + ";");
}
sb.AppendLine(" gettimeofday(&startTv, &startTz);");
sb.AppendLine("#ifndef __MINGW32__");
sb.AppendLine(" if (threads > 1) {");
sb.AppendLine(" struct ThreadData testThreadData;");
sb.AppendLine(" pthread_t *testThreads = (pthread_t *)malloc(threads * sizeof(pthread_t));");
sb.AppendLine(" testThreadData.A = A;");
sb.AppendLine(" testThreadData.B = B;");
sb.AppendLine(" testThreadData.fpArr = fpArr;");
sb.AppendLine(" testThreadData.list_size = list_size;");
sb.AppendLine(" testThreadData.structIterations = structIterations;");
sb.AppendLine(" for (int threadIdx = 0; threadIdx < threads; threadIdx++) {");
sb.AppendLine($" pthread_create(testThreads + threadIdx, NULL, {IUarchTest.ThreadLaunchFunctionPrefix}{Prefix}{Counts[i]}, &testThreadData);");
sb.AppendLine(" }");
sb.AppendLine(" for (int threadIdx = 0; threadIdx < threads; threadIdx++) {");
sb.AppendLine(" pthread_join(testThreads[threadIdx], NULL);");
sb.AppendLine(" }");
sb.AppendLine(" free(testThreads);");
// launch threads
sb.AppendLine(" } else ");
sb.AppendLine(" " + Prefix + Counts[i] + $"({GetFunctionCallParameters});");
sb.AppendLine("#else");
sb.AppendLine(" " + Prefix + Counts[i] + $"({GetFunctionCallParameters});");
sb.AppendLine("#endif");
sb.AppendLine(" gettimeofday(&endTv, &endTz);");
sb.AppendLine(" time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);");
//sb.AppendLine(" fprintf(stderr, \"%lu ms elapsed, %lu iter\\n\", time_diff_ms, structIterations);");
if (DivideTimeByCount)
sb.AppendLine(" latency = 1e6 * (float)time_diff_ms / (float)(iterations);");
else
sb.AppendLine(" latency = 1e6 * (float)time_diff_ms / (float)(structIterations);");
sb.AppendLine(" printf(\"" + Counts[i] + ",%f\\n\", latency);\n");
if (DivideTimeByCount)
{
sb.AppendLine(" structIterations = tmp;");
}
}
sb.AppendLine(" }\n");
}
/// <summary>
/// MIPS doesn't have an indexed load instruction which means we'd have to use an
/// add+shift (extra two instructions), which would complicate measurements.
/// So convert the 32-bit index array into 64-bit direct addresses in place.
/// NOTE(review): register names and mnemonics ($r12, alsl.d, ld.d) look like
/// LoongArch/Loongson assembler syntax rather than classic MIPS — confirm toolchain.
/// </summary>
/// <param name="sb">StringBuilder to append the generated assembly to</param>
public static void GenerateMipsPrepArrayFunction(StringBuilder sb)
{
// r4 = ptr to arr, r5 = arr len, in 32-bit elements
sb.AppendLine(".global preplatencyarr");
sb.AppendLine("preplatencyarr:");
sb.AppendLine(" xor $r12, $r12, $r12");
sb.AppendLine(" xor $r13, $r13, $r13");
sb.AppendLine(" xor $r14, $r14, $r14");
sb.AppendLine(" xor $r15, $r15, $r15"); // array index
sb.AppendLine(" addi.d $r14, $r14, 1");
sb.AppendLine("preplatencyarr_loop:");
sb.AppendLine(" alsl.d $r12, $r15, $r0, 0x3"); // shift by 3 = multiply by 8 for 64-bit
sb.AppendLine(" add.d $r12, $r4, $r12"); // add loaded value to base address
sb.AppendLine(" ld.d $r13, $r12, 0");
sb.AppendLine(" alsl.d $r13, $r13, $r0, 0x2"); // address calculation for loaded index. this is in 32-bit values
sb.AppendLine(" add.d $r13, $r4, $r13");
sb.AppendLine(" st.d $r13, $r12, 0"); // save calculated address
sb.AppendLine(" add.d $r15, $r15, $r14");
sb.AppendLine(" alsl.d $r16, $r15, $r0, 0x1"); // muliply 64-bit index by 2 to prevent out of bounds for 32-bit list size count
sb.AppendLine(" bne $r16, $r5, preplatencyarr_loop"); // while idx != len
sb.AppendLine(" jr $r1");
}
/// <summary>
/// RISC-V version of the pointer chasing array prep: rewrites each 64-bit slot of
/// the array from a 32-bit element index into a direct address (x10 = arr, x11 = len).
/// </summary>
/// <param name="sb">StringBuilder to append the generated assembly to</param>
public static void GenerateRiscvPrepArrayFunction(StringBuilder sb)
{
sb.AppendLine(".global preplatencyarr");
sb.AppendLine("preplatencyarr:");
sb.AppendLine(" li x7, 0");
sb.AppendLine(" mv x5, x10");
sb.AppendLine("preplatencyarr_loop:");
sb.AppendLine(" ld x28, (x5)");
sb.AppendLine(" slli x28, x28, 2"); // index specified in 32-bit values
sb.AppendLine(" add x28, x28, x10");
sb.AppendLine(" sd x28, (x5)");
sb.AppendLine(" addi x5, x5, 8"); // next element
sb.AppendLine(" addi x7, x7, 2"); // list size is given in 32-bit elements
sb.AppendLine(" blt x7, x11, preplatencyarr_loop");
sb.AppendLine(" ret");
}
}
}
================================================
FILE: AsmGen/UarchTestHelpers.cs
================================================
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace AsmGen
{
public static class UarchTestHelpers
{
/// <summary>
/// Builds the array of data points to test: every value from low to high
/// inclusive, stepping by 'step'.
/// </summary>
public static int[] GenerateCountArray(int low, int high, int step)
{
    // First pass: how many points fall in [low, high] at this stride
    int pointCount = 0;
    for (int value = low; value <= high; value += step)
        pointCount++;
    // Second pass: fill the result array
    int[] points = new int[pointCount];
    int writeIdx = 0;
    for (int value = low; value <= high; value += step)
        points[writeIdx++] = value;
    return points;
}
/// <summary>
/// Emits NASM-style "global" directives, one per generated test function,
/// so the assembled symbols are visible to the linker.
/// </summary>
public static void GenerateNasmGlobalLines(StringBuilder sb, UarchTest test)
{
    foreach (int count in test.Counts)
        sb.AppendLine("global " + test.Prefix + count);
}
/// <summary>
/// Emits GNU assembler ".global" directives, one per generated test function,
/// so the assembled symbols are visible to the linker.
/// </summary>
public static void GenerateAsmGlobalLines(StringBuilder sb, UarchTest test)
{
    foreach (int count in test.Counts)
        sb.AppendLine($".global {test.Prefix}{count}");
}
/// <summary>
/// Emits C extern declarations for the generated assembly test functions, forcing
/// the SysV calling convention so mingw/Windows builds match the assembly's
/// argument passing.
/// </summary>
/// <param name="sb">StringBuilder to append the declarations to</param>
/// <param name="test">Test supplying the function name prefix, counts, and C parameter list</param>
public static void GenerateExternLines(StringBuilder sb, UarchTest test)
{
    int[] counts = test.Counts;
    for (int i = 0; i < counts.Length; i++)
        // removed stray empty statement (";;") that previously followed this call
        sb.AppendLine("extern uint64_t " + test.Prefix + counts[i] + $"({test.FunctionDefinitionParameters}) __attribute((sysv_abi));");
}
/// <summary>
/// Emits C++ extern declarations for the Visual Studio build; extern "C" disables
/// name mangling so the C++ harness links against the plain assembly symbols.
/// </summary>
public static void GenerateVsExternLines(StringBuilder sb, UarchTest test)
{
    foreach (int count in test.Counts)
        sb.AppendLine($"extern \"C\" uint64_t {test.Prefix}{count}({test.FunctionDefinitionParameters});");
}
/// <summary>
/// Generates structure size test functions in x86-64 assembly (AT&amp;T syntax), with
/// filler instructions placed between two serial chains of six idivs each. Once the
/// filler no longer fits in the structure under test, the two slow divide chains can
/// no longer overlap and time per iteration rises.
/// Args are moved into rcx, rdx, r8 (in that order) to match the Windows calling convention.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Sizes to test the structure at</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="fillerInstrs1">Filler instructions after first divide chain</param>
/// <param name="fillerInstrs2">Filler instructions after second divide chain</param>
/// <param name="includePtrChasingLoads">If true, count the bracketing ops as consuming
/// the tested resource (two fewer filler instructions are emitted)</param>
/// <param name="initInstrs">Any extra initialization instructions</param>
public static void GenerateX86AsmDivStructureTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool includePtrChasingLoads = true, string initInstrs = null)
{
for (int i = 0; i < counts.Length; i++)
{
string funcName = funcNamePrefix + counts[i];
sb.AppendLine("\n" + funcName + ":");
// save every register the generated body may clobber
sb.AppendLine(" push %rsi");
sb.AppendLine(" push %rdi");
sb.AppendLine(" push %r15");
sb.AppendLine(" push %r14");
sb.AppendLine(" push %r13");
sb.AppendLine(" push %r12");
sb.AppendLine(" push %r11");
sb.AppendLine(" push %r8");
sb.AppendLine(" push %r9");
sb.AppendLine(" push %rcx");
sb.AppendLine(" push %rdx");
// arguments are in RDI, RSI, RDX, RCX, R8, and R9
// move them into familiar windows argument regs (rcx, rdx, r8)
sb.AppendLine(" mov %rcx, %r9"); // r9 <- rcx
sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
// r11-r15 hold small constants for filler instructions to consume
sb.AppendLine(" xor %r15, %r15");
sb.AppendLine(" mov $0x10, %r14");
sb.AppendLine(" mov $0x20, %r13");
sb.AppendLine(" mov $0x30, %r12");
sb.AppendLine(" mov $0x40, %r11");
if (initInstrs != null) sb.AppendLine(initInstrs);
sb.AppendLine(" mov %rdx, %rdi");
sb.AppendLine(" mov %rdx, %rsi");
sb.AppendLine("\n" + funcName + "start:");
// keep dividing list size by itself
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" mov %rdi, %rax");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" sub %rax, %rsi");
sb.AppendLine(" inc %rsi");
// rdx is the remainder, rax is the quotient
int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
// cycle round-robin through the supplied filler instructions
for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
{
sb.AppendLine(fillerInstrs1[instrIdx]);
instrIdx = (instrIdx + 1) % fillerInstrs1.Length;
}
// second divide chain, mirroring the first with rdi/rsi swapped
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" mov %rsi, %rax");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" sub %rax, %rdi");
sb.AppendLine(" inc %rdi");
for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
{
sb.AppendLine(fillerInstrs2[instrIdx]);
instrIdx = (instrIdx + 1) % fillerInstrs2.Length;
}
// rcx = iteration count (first argument)
sb.AppendLine(" dec %rcx");
sb.AppendLine(" jne " + funcName + "start");
sb.AppendLine(" pop %rdx");
sb.AppendLine(" pop %rcx");
sb.AppendLine(" pop %r9");
sb.AppendLine(" pop %r8");
sb.AppendLine(" pop %r11");
sb.AppendLine(" pop %r12");
sb.AppendLine(" pop %r13");
sb.AppendLine(" pop %r14");
sb.AppendLine(" pop %r15");
sb.AppendLine(" pop %rdi");
sb.AppendLine(" pop %rsi");
sb.AppendLine(" ret\n\n");
}
}
/// <summary>
/// Generates non-scheduling-queue style test functions bracketed by idiv chains.
/// A fixed total of maxSize filler instructions is emitted after each divide chain;
/// the first counts[i] of them depend on the divide result, the rest are independent.
/// When the dependent run exceeds the queue capacity under test, the independent ops
/// can no longer issue and a dispatch stall shows up in the timing.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="maxSize">Total filler ops emitted per section (dependent + independent)</param>
/// <param name="counts">Data points: number of dependent ops at each step</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="depInstrs">Instructions dependent on the divide chain result</param>
/// <param name="indepInstrs">Instructions independent of the divide chain</param>
/// <param name="divsInSq">If true, count the six divide ops against the tested capacity</param>
/// <param name="initInstrs">Any extra initialization instructions</param>
public static void GenerateX86AsmDivNsqTestFuncs(StringBuilder sb,
int maxSize,
int[] counts,
string funcNamePrefix,
string[] depInstrs,
string[] indepInstrs,
bool divsInSq = false,
string initInstrs = null)
{
for (int i = 0; i < counts.Length; i++)
{
string funcName = funcNamePrefix + counts[i];
sb.AppendLine("\n" + funcName + ":");
// save registers clobbered by the generated body
sb.AppendLine(" push %rsi");
sb.AppendLine(" push %rdi");
sb.AppendLine(" push %r15");
sb.AppendLine(" push %r14");
sb.AppendLine(" push %r13");
sb.AppendLine(" push %r12");
sb.AppendLine(" push %r11");
sb.AppendLine(" push %r8");
sb.AppendLine(" push %rcx");
sb.AppendLine(" push %rdx");
// arguments are in RDI, RSI, RDX, RCX, R8, and R9
// move them into familiar windows argument regs (rcx, rdx, r8)
sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
// r11-r15 hold small constants for filler instructions to consume
sb.AppendLine(" xor %r15, %r15");
sb.AppendLine(" mov $0x10, %r14");
sb.AppendLine(" mov $0x20, %r13");
sb.AppendLine(" mov $0x30, %r12");
sb.AppendLine(" mov $0x40, %r11");
if (initInstrs != null) sb.AppendLine(initInstrs);
sb.AppendLine(" mov %rdx, %rdi");
sb.AppendLine(" mov %rdx, %rsi");
sb.AppendLine("\n" + funcName + "start:");
// keep dividing list size by itself
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" mov %rdi, %rax"); // divide rdi by rsi
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rsi");
sb.AppendLine(" sub %rax, %rsi");
sb.AppendLine(" inc %rsi");
// rdx is the remainder, rax is the quotient
// subtract 6 when the divide chain itself occupies the queue under test
int fillerInstrCount = divsInSq ? counts[i] - 6 : counts[i];
// dependent ops first, then pad with independent ops up to maxSize total
for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < maxSize; fillerIdx++)
{
if (fillerIdx < fillerInstrCount)
{
sb.AppendLine(depInstrs[depInstrIdx]);
depInstrIdx = (depInstrIdx + 1) % depInstrs.Length;
}
else
{
sb.AppendLine(indepInstrs[indepInstrIdx]);
indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;
}
}
// mirrored second section with rdi/rsi swapped
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" mov %rsi, %rax"); // divide rsi by rdi
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" xor %rdx, %rdx");
sb.AppendLine(" idiv %rdi");
sb.AppendLine(" sub %rax, %rdi");
sb.AppendLine(" inc %rdi");
for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < maxSize; fillerIdx++)
{
if (fillerIdx < fillerInstrCount)
{
sb.AppendLine(depInstrs[depInstrIdx]);
depInstrIdx = (depInstrIdx + 1) % depInstrs.Length;
}
else
{
sb.AppendLine(indepInstrs[indepInstrIdx]);
indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;
}
}
// rcx = iteration count (first argument)
sb.AppendLine(" dec %rcx");
sb.AppendLine(" jne " + funcName + "start");
sb.AppendLine(" pop %rdx");
sb.AppendLine(" pop %rcx");
sb.AppendLine(" pop %r8");
sb.AppendLine(" pop %r11");
sb.AppendLine(" pop %r12");
sb.AppendLine(" pop %r13");
sb.AppendLine(" pop %r14");
sb.AppendLine(" pop %r15");
sb.AppendLine(" pop %rdi");
sb.AppendLine(" pop %rsi");
sb.AppendLine(" ret\n\n");
}
}
/// <summary>
/// Generates structure size test functions using two pointer chasing loads
/// (cache misses) as the serializing operations, with filler instructions between
/// them. Once the filler exceeds the structure's capacity the two misses can no
/// longer overlap. With lfence=true the second load is followed by an lfence
/// instead of a second filler block.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Sizes to test the structure at</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="fillerInstrs1">Filler instructions after first ptr chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after second ptr chasing load (unused when lfence=true)</param>
/// <param name="includePtrChasingLoads">If true, count ptr chasing loads as consuming the tested resource</param>
/// <param name="initInstrs">Any extra initialization instructions</param>
/// <param name="postLoadInstrs1">Instructions emitted right after the first load</param>
/// <param name="postLoadInstrs2">Instructions emitted right after the second load (non-lfence path)</param>
/// <param name="lfence">Serialize with lfence after the second load instead of emitting fillerInstrs2</param>
/// <param name="cleanupInstrs">Instructions emitted after the loop, before register restore</param>
public static void GenerateX86AsmStructureTestFuncs(StringBuilder sb,
int[] counts,
string funcNamePrefix,
string[] fillerInstrs1,
string[] fillerInstrs2,
bool includePtrChasingLoads = true,
string initInstrs = null,
string postLoadInstrs1 = null,
string postLoadInstrs2 = null,
bool lfence = true,
string cleanupInstrs = null)
{
for (int i = 0; i < counts.Length; i++)
{
string funcName = funcNamePrefix + counts[i];
sb.AppendLine("\n" + funcName + ":");
// save registers clobbered by the generated body
sb.AppendLine(" push %rsi");
sb.AppendLine(" push %rdi");
sb.AppendLine(" push %r15");
sb.AppendLine(" push %r14");
sb.AppendLine(" push %r13");
sb.AppendLine(" push %r12");
sb.AppendLine(" push %r11");
sb.AppendLine(" push %r8");
sb.AppendLine(" push %rcx");
sb.AppendLine(" push %rdx");
// arguments are in RDI, RSI, RDX, RCX, R8, and R9
// move them into familiar windows argument regs (rcx, rdx, r8)
sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
// r11-r15 hold small constants for filler instructions to consume
sb.AppendLine(" xor %r15, %r15");
sb.AppendLine(" mov $0x1, %r14");
sb.AppendLine(" mov $0x2, %r13");
sb.AppendLine(" mov $0x3, %r12");
sb.AppendLine(" mov $0x4, %r11");
if (initInstrs != null) sb.AppendLine(initInstrs);
// start two independent pointer chasing chains at indices 0 and 0x40;
// rdx holds the array base, edi/esi hold the current indices
sb.AppendLine(" xor %rdi, %rdi");
sb.AppendLine(" mov $0x40, %esi");
sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
sb.AppendLine("\n" + funcName + "start:");
sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);
int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
// cycle round-robin through the supplied filler instructions
for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
{
sb.AppendLine(fillerInstrs1[instrIdx]);
instrIdx = (instrIdx + 1) % fillerInstrs1.Length;
}
sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
if (lfence) sb.AppendLine("lfence");
else
{
if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);
for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
{
sb.AppendLine(fillerInstrs2[instrIdx]);
instrIdx = (instrIdx + 1) % fillerInstrs2.Length;
}
}
// rcx = iteration count (first argument)
sb.AppendLine(" dec %rcx");
sb.AppendLine(" jne " + funcName + "start");
if (cleanupInstrs != null) sb.AppendLine(cleanupInstrs);
sb.AppendLine(" pop %rdx");
sb.AppendLine(" pop %rcx");
sb.AppendLine(" pop %r8");
sb.AppendLine(" pop %r11");
sb.AppendLine(" pop %r12");
sb.AppendLine(" pop %r13");
sb.AppendLine(" pop %r14");
sb.AppendLine(" pop %r15");
sb.AppendLine(" pop %rdi");
sb.AppendLine(" pop %rsi");
sb.AppendLine(" ret\n\n");
}
}
/// <summary>
/// Generate test functions to see how big a scheduler is, without a NSQ.
/// Dependent ops are followed by independent ops, total op count = totalOps.
/// If the number of dependent ops is greater than SQ size, indep ops can't be
/// executed and there will be a dispatch stall.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="totalOps">Total ops between dependent loads. Must be less than RF size
/// but greater than SQ+NSQ size</param>
/// <param name="counts">Array of data points to test (SQ sizes in this case)</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="dependentInstrs">Instructions dependent on the ptr chasing load</param>
/// <param name="indepInstrs">Instructions independent of the ptr chasing load</param>
/// <param name="ptrChasingLoadsInSq">Do ptr chasing loads occupy entries in the SQ being measured?</param>
/// <param name="initInstrs">Any extra initialization instructions</param>
/// <param name="postLoadInstrs">Instructions emitted right after the first load</param>
public static void GenerateX86AsmNsqTestFuncs(StringBuilder sb,
int totalOps,
int[] counts,
string funcNamePrefix,
string[] dependentInstrs,
string[] indepInstrs,
bool ptrChasingLoadsInSq = false,
string initInstrs = null,
string postLoadInstrs = null)
{
for (int i = 0; i < counts.Length; i++)
{
string funcName = funcNamePrefix + counts[i];
sb.AppendLine("\n" + funcName + ":");
// save registers clobbered by the generated body
sb.AppendLine(" push %rsi");
sb.AppendLine(" push %rdi");
sb.AppendLine(" push %r15");
sb.AppendLine(" push %r14");
sb.AppendLine(" push %r13");
sb.AppendLine(" push %r12");
sb.AppendLine(" push %r11");
sb.AppendLine(" push %r8");
sb.AppendLine(" push %rcx");
sb.AppendLine(" push %rdx");
// arguments are in RDI, RSI, RDX, RCX, R8, and R9
// move them into familiar windows argument regs (rcx, rdx, r8)
sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
// r11-r15 hold small constants for filler instructions to consume
sb.AppendLine(" xor %r15, %r15");
sb.AppendLine(" mov $0x1, %r14");
sb.AppendLine(" mov $0x2, %r13");
sb.AppendLine(" mov $0x3, %r12");
sb.AppendLine(" mov $0x4, %r11");
if (initInstrs != null) sb.AppendLine(initInstrs);
// start two pointer chasing chains at indices 0 and 0x40
sb.AppendLine(" xor %rdi, %rdi");
sb.AppendLine(" mov $0x40, %esi");
sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
sb.AppendLine("\n" + funcName + "start:");
sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
if (postLoadInstrs != null) sb.AppendLine(postLoadInstrs);
int sqInstrs = ptrChasingLoadsInSq ? counts[i] - 2 : counts[i];
// dependent ops first, then pad with independent ops up to totalOps
for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < totalOps; fillerIdx++)
{
if (fillerIdx < sqInstrs)
{
sb.AppendLine(dependentInstrs[depInstrIdx]);
depInstrIdx = (depInstrIdx + 1) % dependentInstrs.Length;
}
else
{
sb.AppendLine(indepInstrs[indepInstrIdx]);
indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;
}
}
sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
sb.AppendLine(" lfence");
// rcx = iteration count (first argument)
sb.AppendLine(" dec %rcx");
sb.AppendLine(" jne " + funcName + "start");
sb.AppendLine(" pop %rdx");
sb.AppendLine(" pop %rcx");
sb.AppendLine(" pop %r8");
sb.AppendLine(" pop %r11");
sb.AppendLine(" pop %r12");
sb.AppendLine(" pop %r13");
sb.AppendLine(" pop %r14");
sb.AppendLine(" pop %r15");
sb.AppendLine(" pop %rdi");
sb.AppendLine(" pop %rsi");
sb.AppendLine(" ret\n\n");
}
}
/// <summary>
/// Generate test functions for testing integer scheduler capacity.
/// R15's value is made dependent on the pointer chasing load results
/// (mov %rdi/%rsi into %r15 after each load), so filler instructions reading
/// r15 are dependent on the cache miss.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Sizes to test the scheduler at</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="fillerInstrs1">Filler instructions after first ptr chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after second ptr chasing load</param>
/// <param name="divs">If true, count the bracketing ops against the tested capacity
/// (two fewer filler instructions are emitted)</param>
/// <param name="initInstrs">Any extra initialization instructions</param>
public static void GenerateX86AsmIntSchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool divs = true, string initInstrs = null)
{
for (int i = 0; i < counts.Length; i++)
{
string funcName = funcNamePrefix + counts[i];
sb.AppendLine("\n" + funcName + ":");
// save registers clobbered by the generated body
sb.AppendLine(" push %rsi");
sb.AppendLine(" push %rdi");
sb.AppendLine(" push %r15");
sb.AppendLine(" push %r14");
sb.AppendLine(" push %r13");
sb.AppendLine(" push %r12");
sb.AppendLine(" push %r11");
sb.AppendLine(" push %r8");
sb.AppendLine(" push %rcx");
sb.AppendLine(" push %rdx");
// arguments are in RDI, RSI, RDX, RCX, R8, and R9
// move them into familiar windows argument regs (rcx, rdx, r8)
sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
// r11-r15 hold small constants for filler instructions to consume
sb.AppendLine(" xor %r15, %r15");
sb.AppendLine(" mov $0x1, %r14");
sb.AppendLine(" mov $0x2, %r13");
sb.AppendLine(" mov $0x3, %r12");
sb.AppendLine(" mov $0x4, %r11");
if (initInstrs != null) sb.AppendLine(initInstrs);
// start two pointer chasing chains at indices 0 and 0x40
sb.AppendLine(" xor %rdi, %rdi");
sb.AppendLine(" mov $0x40, %esi");
sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
sb.AppendLine("\n" + funcName + "start:");
sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
// make r15 depend on the load result so filler reading r15 waits on the miss
sb.AppendLine(" mov %rdi, %r15");
int fillerInstrCount = divs ? counts[i] - 2 : counts[i];
// cycle round-robin through the supplied filler instructions
for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
{
sb.AppendLine(fillerInstrs1[instrIdx]);
instrIdx = (instrIdx + 1) % fillerInstrs1.Length;
}
sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
sb.AppendLine(" mov %rsi, %r15");
for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
{
sb.AppendLine(fillerInstrs2[instrIdx]);
instrIdx = (instrIdx + 1) % fillerInstrs2.Length;
}
// rcx = iteration count (first argument)
sb.AppendLine(" dec %rcx");
sb.AppendLine(" jne " + funcName + "start");
sb.AppendLine(" pop %rdx");
sb.AppendLine(" pop %rcx");
sb.AppendLine(" pop %r8");
sb.AppendLine(" pop %r11");
sb.AppendLine(" pop %r12");
sb.AppendLine(" pop %r13");
sb.AppendLine(" pop %r14");
sb.AppendLine(" pop %r15");
sb.AppendLine(" pop %rdi");
sb.AppendLine(" pop %rsi");
sb.AppendLine(" ret\n\n");
}
}
/// <summary>
/// Generates pointer chasing test functions in assembly (AT&amp;T syntax), with
/// xmm0 &lt;- cvtsi2ss of each pointer chasing load result, so FP filler instructions
/// reading xmm0 depend on the loads. xmm1-xmm5 are initialized from the third
/// argument and can be used as independent filler operands.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test; one function is generated per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load</param>
public static void GenerateX86AsmFpSchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        sb.AppendLine("\n" + funcName + ":");
        // save registers clobbered below; popped in reverse order before ret
        sb.AppendLine(" push %rsi");
        sb.AppendLine(" push %rdi");
        sb.AppendLine(" push %r15");
        sb.AppendLine(" push %r14");
        sb.AppendLine(" push %r13");
        sb.AppendLine(" push %r12");
        sb.AppendLine(" push %r8");
        sb.AppendLine(" push %rcx");
        sb.AppendLine(" push %rdx");
        // arguments are in RDI, RSI, RDX, RCX, R8, and R9
        // move them into familiar windows argument regs (rcx, rdx, r8)
        sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
        sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
        sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
        // known integer values for filler instructions to consume
        sb.AppendLine(" xor %r15, %r15");
        sb.AppendLine(" mov $0x1, %r14");
        sb.AppendLine(" mov $0x1, %r13");
        sb.AppendLine(" mov $0x3, %r12");
        // initialize some FP values off r8 (third argument)
        sb.AppendLine(" movss (%r8), %xmm1");
        sb.AppendLine(" movss 4(%r8), %xmm2");
        sb.AppendLine(" movss 8(%r8), %xmm3");
        sb.AppendLine(" movss 12(%r8), %xmm4");
        sb.AppendLine(" movss 16(%r8), %xmm5");
        // start one chain at 0, and the other at 0x40
        sb.AppendLine(" xor %rdi, %rdi");
        sb.AppendLine(" mov $0x40, %esi");
        sb.AppendLine("\n" + funcName + "start:");
        sb.AppendLine(" mov (%rdx,%rdi,4), %edi"); // chain 1: current = A[current]
        sb.AppendLine(" cvtsi2ss %rdi, %xmm0"); // make xmm0 depend on the load result
        int fillerInstrCount = counts[i];
        for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
        {
            sb.AppendLine(fillerInstrs1[instrIdx]);
            instrIdx = (instrIdx + 1) % fillerInstrs1.Length;
        }
        sb.AppendLine(" mov (%rdx,%rsi,4), %esi"); // chain 2: current = A[current]
        sb.AppendLine(" cvtsi2ss %rsi, %xmm0");
        for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
        {
            sb.AppendLine(fillerInstrs2[instrIdx]);
            instrIdx = (instrIdx + 1) % fillerInstrs2.Length;
        }
        sb.AppendLine(" dec %rcx");
        sb.AppendLine(" jne " + funcName + "start");
        sb.AppendLine(" pop %rdx");
        sb.AppendLine(" pop %rcx");
        sb.AppendLine(" pop %r8");
        sb.AppendLine(" pop %r12");
        sb.AppendLine(" pop %r13");
        sb.AppendLine(" pop %r14");
        sb.AppendLine(" pop %r15");
        sb.AppendLine(" pop %rdi");
        sb.AppendLine(" pop %rsi");
        sb.AppendLine(" ret\n\n");
    }
}
/// <summary>
/// 256-bit variant of <see cref="GenerateX86AsmFpSchedTestFuncs"/>: ymm0 is filled
/// via vbroadcastss from an address that depends on each pointer chasing load result,
/// so 256-bit filler instructions reading ymm0 depend on the loads. ymm1-ymm5 are
/// initialized from the third argument as independent operands.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test; one function is generated per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load</param>
public static void GenerateX86AsmFp256SchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        sb.AppendLine("\n" + funcName + ":");
        // save registers clobbered below; popped in reverse order before ret
        sb.AppendLine(" push %rsi");
        sb.AppendLine(" push %rdi");
        sb.AppendLine(" push %r15");
        sb.AppendLine(" push %r14");
        sb.AppendLine(" push %r13");
        sb.AppendLine(" push %r12");
        sb.AppendLine(" push %r8");
        sb.AppendLine(" push %rcx");
        sb.AppendLine(" push %rdx");
        // arguments are in RDI, RSI, RDX, RCX, R8, and R9
        // move them into familiar windows argument regs (rcx, rdx, r8)
        sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
        sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
        sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
        // known integer values for filler instructions to consume
        sb.AppendLine(" xor %r15, %r15");
        sb.AppendLine(" mov $0x1, %r14");
        sb.AppendLine(" mov $0x1, %r13");
        sb.AppendLine(" mov $0x3, %r12");
        // initialize some FP values off r8 (third argument)
        sb.AppendLine(" vzeroupper");
        sb.AppendLine(" vmovups (%r8), %ymm1");
        sb.AppendLine(" vmovups 32(%r8), %ymm2");
        sb.AppendLine(" vmovups 64(%r8), %ymm3");
        sb.AppendLine(" vmovups 96(%r8), %ymm4");
        sb.AppendLine(" vmovups 128(%r8), %ymm5");
        // start one chain at 0, and the other at 0x40
        sb.AppendLine(" xor %rdi, %rdi");
        sb.AppendLine(" mov $0x40, %esi");
        sb.AppendLine("\n" + funcName + "start:");
        sb.AppendLine(" mov (%rdx,%rdi,4), %edi"); // chain 1: current = A[current]
        sb.AppendLine(" vbroadcastss (%r8,%rdi,4), %ymm0"); // ymm0 depends on the load result
        int fillerInstrCount = counts[i];
        for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
        {
            sb.AppendLine(fillerInstrs1[instrIdx]);
            instrIdx = (instrIdx + 1) % fillerInstrs1.Length;
        }
        sb.AppendLine(" mov (%rdx,%rsi,4), %esi"); // chain 2: current = A[current]
        sb.AppendLine(" vbroadcastss (%r8,%rsi,4), %ymm0");
        for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
        {
            sb.AppendLine(fillerInstrs2[instrIdx]);
            instrIdx = (instrIdx + 1) % fillerInstrs2.Length;
        }
        sb.AppendLine(" dec %rcx");
        sb.AppendLine(" jne " + funcName + "start");
        sb.AppendLine(" pop %rdx");
        sb.AppendLine(" pop %rcx");
        sb.AppendLine(" pop %r8");
        sb.AppendLine(" pop %r12");
        sb.AppendLine(" pop %r13");
        sb.AppendLine(" pop %r14");
        sb.AppendLine(" pop %r15");
        sb.AppendLine(" pop %rdi");
        sb.AppendLine(" pop %rsi");
        sb.AppendLine(" ret\n\n");
    }
}
/// <summary>
/// Generates test functions in assembly (NASM/Intel syntax), with filler instructions
/// between two chains of dependent integer divides (the slow ops that stall retirement).
/// Args are expected in rcx, rdx, r8 (in that order), matching the Windows calling convention.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Sizes to test the structure at</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="fillerInstrs1">Filler instructions after the first divide chain</param>
/// <param name="fillerInstrs2">Filler instructions after the second divide chain</param>
/// <param name="includePtrChasingLoads">If true (default), count two instructions per block
/// against the tested resource, i.e. emit two fewer filler instructions. The name follows
/// the other generators; here the long-latency op is a divide chain, not a load.</param>
/// <param name="initInstrs">Any extra initialization instructions</param>
public static void GenerateX86NasmDivStructureTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool includePtrChasingLoads = true, string initInstrs = null)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        sb.AppendLine("\n" + funcName + ":");
        // save registers clobbered below; popped in reverse order before ret
        sb.AppendLine(" push rsi");
        sb.AppendLine(" push rdi");
        sb.AppendLine(" push r15");
        sb.AppendLine(" push r14");
        sb.AppendLine(" push r13");
        sb.AppendLine(" push r12");
        sb.AppendLine(" push r11");
        // known values for filler instructions to consume
        sb.AppendLine(" xor r15, r15");
        sb.AppendLine(" mov r14, 0x10");
        sb.AppendLine(" mov r13, 0x20");
        sb.AppendLine(" mov r12, 0x30");
        sb.AppendLine(" mov r11, 0x40");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        // rdx (second argument) seeds both divide chains
        sb.AppendLine(" mov rdi, rdx");
        sb.AppendLine(" mov rsi, rdx");
        sb.AppendLine("\n" + funcName + "start:");
        // first chain: six dependent idivs; rdx must be zeroed before each
        // because idiv divides rdx:rax
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" mov rax, rdi");
        sb.AppendLine(" idiv rsi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rsi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rsi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rsi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rsi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rsi");
        // keep rsi's value dependent on the chain's result, and nonzero
        sb.AppendLine(" sub rsi, rax");
        sb.AppendLine(" inc rsi");
        int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
        for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
        {
            sb.AppendLine(fillerInstrs1[instrIdx]);
            instrIdx = (instrIdx + 1) % fillerInstrs1.Length;
        }
        // second chain, mirrored: divides by rdi, result folded back into rdi
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" mov rax, rsi");
        sb.AppendLine(" idiv rdi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rdi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rdi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rdi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rdi");
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv rdi");
        sb.AppendLine(" sub rdi, rax");
        sb.AppendLine(" inc rdi");
        for (int fillerIdx = 0, instrIdx = 0; fillerIdx < fillerInstrCount; fillerIdx++)
        {
            sb.AppendLine(fillerInstrs2[instrIdx]);
            instrIdx = (instrIdx + 1) % fillerInstrs2.Length;
        }
        sb.AppendLine(" dec rcx");
        sb.AppendLine(" jne " + funcName + "start");
        sb.AppendLine(" pop r11");
        sb.AppendLine(" pop r12");
        sb.AppendLine(" pop r13");
        sb.AppendLine(" pop r14");
        sb.AppendLine(" pop r15");
        sb.AppendLine(" pop rdi");
        sb.AppendLine(" pop rsi");
        sb.AppendLine(" ret\n\n");
    }
}
/// <summary>
/// Generates test functions in ARM (aarch64) assembly.
/// Registers x15-x10 hold known values and can be used by filler instructions.
/// Args are in x0 (iterations), x1 (pointer chasing array), x2 (sink).
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test; one function is generated per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load
/// (not emitted when dsb is true)</param>
/// <param name="includePtrChasingLoads">If true, count the two pointer chasing loads
/// against the tested structure, i.e. emit two fewer filler instructions</param>
/// <param name="initInstrs">Extra initialization instructions, or null</param>
/// <param name="postLoadInstrs1">Instructions emitted right after the first load, or null</param>
/// <param name="postLoadInstrs2">Instructions emitted right after the second load
/// (only in non-dsb mode), or null</param>
/// <param name="dsb">use dsb/isb as lfence after the second load instead of the second filler block</param>
public static void GenerateArmAsmStructureTestFuncs(StringBuilder sb,
    int[] counts,
    string funcNamePrefix,
    string[] fillerInstrs1,
    string[] fillerInstrs2,
    bool includePtrChasingLoads = false,
    string initInstrs = null,
    string postLoadInstrs1 = null,
    string postLoadInstrs2 = null,
    bool dsb = true)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        // args in x0, x1
        sb.AppendLine("\n" + funcName + ":");
        // save clobbered registers on the stack
        sb.AppendLine(" sub sp, sp, #0x50");
        sb.AppendLine(" stp x14, x15, [sp, #0x10]");
        sb.AppendLine(" stp x12, x13, [sp, #0x20]");
        sb.AppendLine(" stp x10, x11, [sp, #0x30]");
        sb.AppendLine(" stp x25, x26, [sp, #0x40]");
        // known values for filler instructions to consume
        sb.AppendLine(" mov x15, 1");
        sb.AppendLine(" mov x14, 2");
        sb.AppendLine(" mov x13, 3");
        sb.AppendLine(" mov x12, 4");
        sb.AppendLine(" mov x11, 5");
        sb.AppendLine(" mov x10, 6");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        // two pointer chasing chains: w25 starts at index 0, w26 at 0x40
        sb.AppendLine(" mov w25, 0x0");
        sb.AppendLine(" mov w26, 0x40");
        sb.AppendLine("\n" + funcName + "start:");
        sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
        if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);
        int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
        for (int nopIdx = 0, addIdx = 0; nopIdx < fillerInstrCount; nopIdx++)
        {
            sb.AppendLine(fillerInstrs1[addIdx]);
            addIdx = (addIdx + 1) % fillerInstrs1.Length;
        }
        sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
        if (dsb)
        {
            // serialize: closest aarch64 equivalent to lfence
            sb.AppendLine(" dsb sy");
            sb.AppendLine(" isb sy");
        }
        else
        {
            if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);
            for (int nopIdx = 0, addIdx = 0; nopIdx < fillerInstrCount; nopIdx++)
            {
                sb.AppendLine(fillerInstrs2[addIdx]);
                addIdx = (addIdx + 1) % fillerInstrs2.Length;
            }
        }
        sb.AppendLine(" sub x0, x0, 1");
        sb.AppendLine(" cbnz x0, " + funcName + "start");
        sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
        sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
        sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
        sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
        sb.AppendLine(" add sp, sp, #0x50");
        sb.AppendLine(" ret\n\n");
    }
}
/// <summary>
/// Generates ARM (aarch64) non-scheduling-queue test functions. After the first
/// pointer chasing load, exactly totalOps instructions are emitted: the first
/// counts[i] depend on the load result (they must occupy scheduler/NSQ slots),
/// and the remainder are independent.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="totalOps">Total instructions emitted per pointer chasing load</param>
/// <param name="counts">Number of load-dependent instructions to test; one function per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="dependentInstrs">Instructions dependent on the pointer chasing load result</param>
/// <param name="indepInstrs">Instructions independent of the load result</param>
/// <param name="ptrChasingLoadsInSq">If true, count the two pointer chasing loads
/// against the tested queue (two fewer dependent instructions emitted)</param>
/// <param name="initInstrs">Extra initialization instructions, or null</param>
/// <param name="postLoadInstrs">Instructions emitted right after the first load, or null</param>
public static void GenerateArmAsmNsqTestFuncs(StringBuilder sb,
    int totalOps,
    int[] counts,
    string funcNamePrefix,
    string[] dependentInstrs,
    string[] indepInstrs,
    bool ptrChasingLoadsInSq = false,
    string initInstrs = null,
    string postLoadInstrs = null)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        // args in x0, x1
        sb.AppendLine("\n" + funcName + ":");
        sb.AppendLine(" sub sp, sp, #0x50");
        sb.AppendLine(" stp x14, x15, [sp, #0x10]");
        sb.AppendLine(" stp x12, x13, [sp, #0x20]");
        sb.AppendLine(" stp x10, x11, [sp, #0x30]");
        sb.AppendLine(" stp x25, x26, [sp, #0x40]");
        // known values for filler instructions to consume
        sb.AppendLine(" mov x15, 1");
        sb.AppendLine(" mov x14, 2");
        sb.AppendLine(" mov x13, 3");
        sb.AppendLine(" mov x12, 4");
        sb.AppendLine(" mov x11, 5");
        sb.AppendLine(" mov x10, 6");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        sb.AppendLine(" mov w25, 0x0");
        sb.AppendLine(" mov w26, 0x40");
        sb.AppendLine("\n" + funcName + "start:");
        sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
        if (postLoadInstrs != null) sb.AppendLine(postLoadInstrs);
        int sqInstrs = ptrChasingLoadsInSq ? counts[i] - 2 : counts[i];
        // BUGFIX: use separate round-robin indices for the two arrays (as
        // GenerateArmAsmDivNsqTestFuncs does). Previously indepInstrs was indexed
        // by a counter wrapped with dependentInstrs.Length, which reads out of
        // bounds whenever indepInstrs is shorter than dependentInstrs.
        for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < totalOps; fillerIdx++)
        {
            if (fillerIdx < sqInstrs)
            {
                sb.AppendLine(dependentInstrs[depInstrIdx]);
                depInstrIdx = (depInstrIdx + 1) % dependentInstrs.Length;
            }
            else
            {
                sb.AppendLine(indepInstrs[indepInstrIdx]);
                indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;
            }
        }
        sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
        sb.AppendLine(" dsb sy"); // close enough to lfence
        sb.AppendLine(" isb sy");
        sb.AppendLine(" sub x0, x0, 1");
        sb.AppendLine(" cbnz x0, " + funcName + "start");
        sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
        sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
        sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
        sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
        sb.AppendLine(" add sp, sp, #0x50");
        sb.AppendLine(" ret\n\n");
    }
}
/// <summary>
/// Emits placeholder functions that immediately return — filler for tests that
/// are not implemented yet on a given ISA.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Test sizes; one stub function is emitted per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
public static void GenerateStub(StringBuilder sb, int[] counts, string funcNamePrefix)
{
    foreach (int count in counts)
    {
        sb.AppendLine("\n" + funcNamePrefix + count + ":");
        sb.AppendLine(" ret");
    }
}
/// <summary>
/// FP scheduler test generator for aarch64: after each pointer chasing load,
/// s16 is reloaded from an address derived from the load result, so FP filler
/// instructions reading s16 depend on the loads.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="fillerInstrs1">FP filler instructions after the first load</param>
/// <param name="fillerInstrs2">FP filler instructions after the second load</param>
public static void GenerateArmAsmFpSchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2)
{
    GenerateArmAsmStructureTestFuncs(sb,
        counts,
        funcNamePrefix,
        fillerInstrs1,
        fillerInstrs2,
        includePtrChasingLoads: false,
        initInstrs: null,
        postLoadInstrs1: " ldr s16, [x2, w25, uxtw #2]",
        postLoadInstrs2: " ldr s16, [x2, w26, uxtw #2]");
}
/// <summary>
/// Generates ARM (aarch64) test functions with filler instructions between two
/// chains of dependent udivs — the divide chains play the role the pointer
/// chasing loads do in <see cref="GenerateArmAsmStructureTestFuncs"/>.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test; one function is generated per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first divide chain</param>
/// <param name="fillerInstrs2">Filler instructions after the second divide chain</param>
/// <param name="includePtrChasingLoads">If true, count two instructions per block
/// against the tested structure (two fewer filler instructions emitted)</param>
/// <param name="initInstrs">Extra initialization instructions, or null</param>
public static void GenerateArmAsmDivStructureTestFuncs(StringBuilder sb,
    int[] counts,
    string funcNamePrefix,
    string[] fillerInstrs1,
    string[] fillerInstrs2,
    bool includePtrChasingLoads = false,
    string initInstrs = null)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        // args in x0 = iterations, x1 = list size, x2 = list (sink)
        sb.AppendLine("\n" + funcName + ":");
        sb.AppendLine(" sub sp, sp, #0x50");
        sb.AppendLine(" stp x14, x15, [sp, #0x10]");
        sb.AppendLine(" stp x12, x13, [sp, #0x20]");
        sb.AppendLine(" stp x10, x11, [sp, #0x30]");
        sb.AppendLine(" stp x25, x26, [sp, #0x40]");
        // known values for filler instructions; w13 (=3) is also the divisor below
        sb.AppendLine(" mov x15, 1");
        sb.AppendLine(" mov x14, 2");
        sb.AppendLine(" mov x13, 3");
        sb.AppendLine(" mov x12, 4");
        sb.AppendLine(" mov x11, 5");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        sb.AppendLine(" mov w25, 0x0");
        sb.AppendLine(" mov w26, 0x40");
        sb.AppendLine("\n" + funcName + "start:");
        // first chain: five dependent udivs feeding w25 from the list size
        sb.AppendLine(" mov w25, w1");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
        for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
        {
            sb.AppendLine(fillerInstrs1[addIdx]);
            addIdx = (addIdx + 1) % fillerInstrs1.Length;
        }
        // second chain, mirrored into w26
        sb.AppendLine(" mov w26, w1");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
        {
            sb.AppendLine(fillerInstrs2[addIdx]);
            addIdx = (addIdx + 1) % fillerInstrs2.Length;
        }
        sb.AppendLine(" sub x0, x0, 1");
        sb.AppendLine(" cbnz x0, " + funcName + "start");
        sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
        sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
        sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
        sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
        sb.AppendLine(" add sp, sp, #0x50");
        sb.AppendLine(" ret\n\n");
    }
}
// Just to deal with A73: returns a compare + conditional branch whose outcome
// depends on both pointer chasing results (x25/x26), keeping the branch
// unresolved until the loads complete.
public static string GetArmDependentBranch(string prefix)
{
    return " cmp x25, x26\n b.eq " + prefix + "_badthing";
}
// Landing pad for GetArmDependentBranch: a label followed by a raw data word
// that is not expected to execute — presumably an undefined encoding so a
// mispredicted fall-through would fault (TODO confirm encoding intent).
public static string GetArmDependentBranchTarget(string prefix)
{
    return string.Concat(prefix, "_badthing:\n .word 0xf7f0a000");
}
// RISC-V counterpart of GetArmDependentBranch: branch depends on both
// pointer chasing results (x5/x6).
public static string GetRiscvDependentBranch(string prefix)
{
    return " beq x5, x6, " + prefix + "_badthing";
}
// Landing pad for GetRiscvDependentBranch: label plus an all-zero word,
// which is not expected to execute.
public static string GetRiscvDependentBranchTarget(string prefix)
{
    return string.Concat(prefix, "_badthing:\n .word 0x00000000");
}
/// <summary>
/// Generates ARM (aarch64) non-scheduling-queue test functions where the
/// long-latency op is a chain of dependent udivs. After each chain, maxSize
/// instructions are emitted: the first (counts[i], or counts[i]-6 if divsInSq)
/// depend on the chain's result, and the rest are independent.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="maxSize">Total instructions emitted after each divide chain</param>
/// <param name="counts">Number of dependent instructions to test; one function per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="depInstrs">Instructions dependent on the divide chain result</param>
/// <param name="indepInstrs">Instructions independent of the divide chain</param>
/// <param name="divsInSq">If true, count the six chain instructions (mov + 5 udivs)
/// against the tested queue</param>
/// <param name="initInstrs">Extra initialization instructions, or null</param>
public static void GenerateArmAsmDivNsqTestFuncs(StringBuilder sb,
    int maxSize,
    int[] counts,
    string funcNamePrefix,
    string[] depInstrs,
    string[] indepInstrs,
    bool divsInSq = false,
    string initInstrs = null)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        // args in x0 = iterations, x1 = list size, x2 = list (sink)
        sb.AppendLine("\n" + funcName + ":");
        sb.AppendLine(" sub sp, sp, #0x50");
        sb.AppendLine(" stp x14, x15, [sp, #0x10]");
        sb.AppendLine(" stp x12, x13, [sp, #0x20]");
        sb.AppendLine(" stp x10, x11, [sp, #0x30]");
        sb.AppendLine(" stp x25, x26, [sp, #0x40]");
        // known values for filler instructions; w13 (=3) is also the divisor
        sb.AppendLine(" mov x15, 1");
        sb.AppendLine(" mov x14, 2");
        sb.AppendLine(" mov x13, 3");
        sb.AppendLine(" mov x12, 4");
        sb.AppendLine(" mov x11, 5");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        sb.AppendLine(" mov w25, 0x0");
        sb.AppendLine(" mov w26, 0x40");
        sb.AppendLine("\n" + funcName + "start:");
        // first chain: five dependent udivs feeding w25
        sb.AppendLine(" mov w25, w1");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        sb.AppendLine(" udiv w25, w25, w13");
        int fillerInstrCount = divsInSq ? counts[i] - 6 : counts[i];
        // emit maxSize instrs total: dependent first, then independent filler
        for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < maxSize; fillerIdx++)
        {
            if (fillerIdx < fillerInstrCount)
            {
                sb.AppendLine(depInstrs[depInstrIdx]);
                depInstrIdx = (depInstrIdx + 1) % depInstrs.Length;
            }
            else
            {
                sb.AppendLine(indepInstrs[indepInstrIdx]);
                indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;
            }
        }
        // second chain, mirrored into w26
        sb.AppendLine(" mov w26, w1");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" udiv w26, w26, w13");
        sb.AppendLine(" mov w25, w26"); // second block's dep instrs also read w25
        for (int fillerIdx = 0, depInstrIdx = 0, indepInstrIdx = 0; fillerIdx < maxSize; fillerIdx++)
        {
            if (fillerIdx < fillerInstrCount)
            {
                sb.AppendLine(depInstrs[depInstrIdx]);
                depInstrIdx = (depInstrIdx + 1) % depInstrs.Length;
            }
            else
            {
                sb.AppendLine(indepInstrs[indepInstrIdx]);
                indepInstrIdx = (indepInstrIdx + 1) % indepInstrs.Length;
            }
        }
        sb.AppendLine(" sub x0, x0, 1");
        sb.AppendLine(" cbnz x0, " + funcName + "start");
        sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
        sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
        sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
        sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
        sb.AppendLine(" add sp, sp, #0x50");
        sb.AppendLine(" ret\n\n");
    }
}
/// <summary>
/// Generates structure test functions with filler instructions between two
/// pointer chasing loads for the mips64 target.
/// NOTE(review): the emitted mnemonics (ld.d/addi.d/bnez, $r registers) look like
/// LoongArch rather than classic MIPS — confirm the intended assembler/target.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test; one function is generated per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load</param>
/// <param name="includePtrChasingLoads">If true, count the two pointer chasing loads
/// against the tested structure (two fewer filler instructions emitted)</param>
/// <param name="initInstrs">Extra initialization instructions, or null</param>
/// <param name="postLoadInstrs1">Instructions emitted right after the first load, or null</param>
/// <param name="postLoadInstrs2">Instructions emitted right after the second load, or null</param>
/// <param name="dsb">Accepted for signature parity with the other ISA generators;
/// currently unused in this method</param>
public static void GenerateMipsAsmStructureTestFuncs(StringBuilder sb,
    int[] counts,
    string funcNamePrefix,
    string[] fillerInstrs1,
    string[] fillerInstrs2,
    bool includePtrChasingLoads = false,
    string initInstrs = null,
    string postLoadInstrs1 = null,
    string postLoadInstrs2 = null,
    bool dsb = false)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        // args in r4 = iterations, r5 = list, r6 = list (sink)
        // use r12 and r13 for ptr chasing loads, r14 as decrement for iteration count
        sb.AppendLine("\n" + funcName + ":");
        sb.AppendLine(" ld.d $r12, $r5, 0");
        sb.AppendLine(" ld.d $r13, $r5, 64");
        // r14 = 1, used as the per-iteration decrement
        sb.AppendLine(" xor $r14, $r14, $r14");
        sb.AppendLine(" addi.d $r14, $r14, 1");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        sb.AppendLine("\n" + funcName + "start:");
        sb.AppendLine(" ld.d $r12, $r12, 0"); // current = *current
        if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);
        int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
        for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
        {
            sb.AppendLine(fillerInstrs1[addIdx]);
            addIdx = (addIdx + 1) % fillerInstrs1.Length;
        }
        sb.AppendLine(" ld.d $r13, $r13, 0");
        if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);
        for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
        {
            sb.AppendLine(fillerInstrs2[addIdx]);
            addIdx = (addIdx + 1) % fillerInstrs2.Length;
        }
        sb.AppendLine(" sub.d $r4, $r4, $r14");
        sb.AppendLine(" bnez $r4, " + funcName + "start");
        sb.AppendLine(" jr $r1"); // return via link register
    }
}
/// <summary>
/// Generates RISC-V structure test functions with filler instructions between
/// two pointer chasing loads.
/// </summary>
/// <param name="sb">StringBuilder to append generated assembly to</param>
/// <param name="counts">Structure sizes to test; one function is generated per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after the second load (not emitted when fence is true)</param>
/// <param name="includePtrChasingLoads">If true, count the two pointer chasing loads
/// against the tested structure (two fewer filler instructions emitted)</param>
/// <param name="initInstrs">Extra initialization instructions, or null</param>
/// <param name="postLoadInstrs1">Instructions emitted right after the first load, or null</param>
/// <param name="postLoadInstrs2">Instructions emitted right after the second load (non-fence mode), or null</param>
/// <param name="fence">Emit a fence after the second load instead of the second filler block</param>
public static void GenerateRiscvAsmStructureTestFuncs(StringBuilder sb,
    int[] counts,
    string funcNamePrefix,
    string[] fillerInstrs1,
    string[] fillerInstrs2,
    bool includePtrChasingLoads = false,
    string initInstrs = null,
    string postLoadInstrs1 = null,
    string postLoadInstrs2 = null,
    bool fence = true)
{
    for (int i = 0; i < counts.Length; i++)
    {
        string funcName = funcNamePrefix + counts[i];
        // args in x10 = iterations, x11 = list, x12 = list (sink)
        // temporaries are x5-x7, x28-x31
        // x18-27 are to be saved
        // use x5 and x6 for ptr chasing loads
        sb.AppendLine("\n" + funcName + ":");
        sb.AppendLine(" addi sp, sp, -88");
        sb.AppendLine(" sd x18, 0(sp)");
        sb.AppendLine(" sd x19, 8(sp)");
        sb.AppendLine(" sd x20, 16(sp)");
        sb.AppendLine(" sd x21, 24(sp)");
        sb.AppendLine(" sd x22, 32(sp)");
        sb.AppendLine(" sd x23, 40(sp)");
        sb.AppendLine(" sd x24, 48(sp)");
        sb.AppendLine(" sd x25, 56(sp)");
        sb.AppendLine(" sd x26, 64(sp)");
        sb.AppendLine(" sd x27, 72(sp)");
        // nudge registers so filler instructions have known-live sources
        sb.AppendLine(" addi x28, x28, 1");
        sb.AppendLine(" addi x29, x29, 1");
        sb.AppendLine(" addi x30, x30, 1");
        sb.AppendLine(" addi x31, x31, 1");
        sb.AppendLine(" addi x18, x18, 2");
        sb.AppendLine(" addi x19, x19, 3");
        sb.AppendLine(" addi x20, x20, 4");
        // FIX: was "addi x22, x21, 5", which broke the reg += const pattern above,
        // leaving x21 untouched and clobbering x22 instead.
        sb.AppendLine(" addi x21, x21, 5");
        sb.AppendLine(" ld x5, (x11)");
        sb.AppendLine(" ld x6, 64(x11)");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        sb.AppendLine("\n" + funcName + "start:");
        sb.AppendLine(" ld x5, (x5)"); // current = *current
        if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);
        int fillerInstrCount = includePtrChasingLoads ? counts[i] - 2 : counts[i];
        for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
        {
            sb.AppendLine(fillerInstrs1[addIdx]);
            addIdx = (addIdx + 1) % fillerInstrs1.Length;
        }
        sb.AppendLine(" ld x6, (x6)");
        if (fence) sb.AppendLine(" fence");
        else
        {
            if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);
            for (int instrIdx = 0, addIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
            {
                sb.AppendLine(fillerInstrs2[addIdx]);
                addIdx = (addIdx + 1) % fillerInstrs2.Length;
            }
        }
        // NOTE(review): "bge x10, x0" loops while x10 >= 0, so the body runs
        // iterations+1 times; the other ISA generators use a != 0 check.
        // Left as-is — confirm whether the extra iteration is intended.
        sb.AppendLine(" addi x10, x10, -1");
        sb.AppendLine(" bge x10, x0, " + funcName + "start");
        sb.AppendLine(" ld x18, 0(sp)");
        sb.AppendLine(" ld x19, 8(sp)");
        sb.AppendLine(" ld x20, 16(sp)");
        sb.AppendLine(" ld x21, 24(sp)");
        sb.AppendLine(" ld x22, 32(sp)");
        sb.AppendLine(" ld x23, 40(sp)");
        sb.AppendLine(" ld x24, 48(sp)");
        sb.AppendLine(" ld x25, 56(sp)");
        sb.AppendLine(" ld x26, 64(sp)");
        sb.AppendLine(" ld x27, 72(sp)");
        sb.AppendLine(" addi sp, sp, 88");
        sb.AppendLine(" ret");
    }
}
}
}
================================================
FILE: AsmGen/tests/A73RobTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
/// <summary>
/// Looking for reordering capacity limits on A73 by combining several different
/// instruction types (128-bit vector adds, scalar integer adds, and stores)
/// behind a pointer chasing load.
/// </summary>
public class A73RobTest : UarchTest
{
    public A73RobTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "a73rob";
        this.Description = "Mixed integer/vec128 + stores";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            // dependent branch after each load (A73 workaround, see GetArmDependentBranch)
            string postLoadInstrs = UarchTestHelpers.GetArmDependentBranch(this.Prefix);
            string initInstrs = " ldr q0, [x1]\n" +
                " ldr q1, [x1, #0x10]\n" +
                " ldr q2, [x1, #0x20]\n" +
                " ldr q3, [x1, #0x30]\n" +
                " ldr q4, [x1, #0x40]\n";
            // FIX: was a non-generic "List", which does not exist under
            // System.Collections.Generic; restore the lost <string> type argument.
            List<string> fillerInstrs = new List<string>();
            for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++)
            {
                // mix instruction types: first 33 vector adds, next 33 integer adds, then stores
                if (i < 33) fillerInstrs.Add(" add v1.4s, v1.4s, v0.4s");
                else if (i < 66) fillerInstrs.Add(" add x15, x15, x11");
                else fillerInstrs.Add(" str x12, [x2]");
            }
            string[] fillerInstrsArr = fillerInstrs.ToArray();
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, fillerInstrsArr, fillerInstrsArr, false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
            // emit the shared (never-executed) branch target once after all functions
            sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
    }
}
}
================================================
FILE: AsmGen/tests/AddLoopTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class AddLoopTest : UarchTest
{
    /// <summary>
    /// ADD throughput for various loop sizes. The dec/branch pair counts toward
    /// the loop size, so real adds emitted per loop = size - 2.
    /// </summary>
    /// <param name="low">Smallest loop size; must be greater than 2</param>
    /// <param name="high">Largest loop size</param>
    /// <param name="step">Step between tested sizes</param>
    public AddLoopTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "addloop";
        this.Description = $"ADD throughput for various loop sizes. Avoids NOP fusing";
        this.FunctionDefinitionParameters = "uint64_t iterations";
        this.GetFunctionCallParameters = "structIterations";
        this.DivideTimeByCount = true;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        if (isa == IUarchTest.ISA.mips64) return false;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);
        if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb);
    }

    // x86 (AT&T syntax): loop of independent adds terminated by dec/jnz on rdi (iteration count)
    public void GenerateX86GccAsm(StringBuilder sb)
    {
        // four independent adds, cycled to avoid a serial dependency chain
        string[] unrolledAdds = new string[4];
        unrolledAdds[0] = " add %r11, %r15";
        unrolledAdds[1] = " add %r11, %r14";
        unrolledAdds[2] = " add %r11, %r13";
        unrolledAdds[3] = " add %r11, %r12";
        for (int i = 0; i < Counts.Length; i++)
        {
            string funcName = this.Prefix + this.Counts[i];
            sb.AppendLine(funcName + ":");
            // count dec, jnz as instructions in the loop
            for (int nopIdx = 0; nopIdx < this.Counts[i] - 2; nopIdx++) sb.AppendLine(unrolledAdds[nopIdx & 3]);
            sb.AppendLine(" dec %rdi");
            sb.AppendLine(" jnz " + funcName);
            sb.AppendLine(" ret");
        }
    }

    // aarch64: same structure, loop terminated by sub/cbnz on x0 (iteration count)
    public void GenerateArmAsm(StringBuilder sb)
    {
        string[] unrolledAdds = new string[4];
        unrolledAdds[0] = " add x15, x15, x11";
        unrolledAdds[1] = " add x14, x14, x11";
        unrolledAdds[2] = " add x13, x13, x11";
        unrolledAdds[3] = " add x12, x12, x11";
        for (int i = 0; i < Counts.Length; i++)
        {
            string funcName = this.Prefix + this.Counts[i];
            sb.AppendLine(funcName + ":");
            // sub/cbnz count toward the loop size, hence Counts[i] - 2 adds
            for (int nopIdx = 0; nopIdx < this.Counts[i] - 2; nopIdx++) sb.AppendLine(unrolledAdds[nopIdx & 3]);
            sb.AppendLine(" sub x0, x0, 1");
            sb.AppendLine(" cbnz x0, " + funcName);
            sb.AppendLine(" ret");
        }
    }
}
}
================================================
FILE: AsmGen/tests/AddNsq.cs
================================================
using System.Text;
namespace AsmGen
{
public class AddNsq : UarchTest
{
    // total instructions emitted per pointer chasing load (dependent + independent)
    private int totalOps;

    /// <summary>
    /// Integer add non-scheduling-queue test: dependent adds behind a pointer
    /// chasing load, padded with independent adds up to totalOps.
    /// </summary>
    public AddNsq(int low, int high, int step, int totalOps)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "addnsq" + totalOps;
        Description = "Integer adds, excluding possible NSQ";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        GetFunctionCallParameters = "structIterations, A, fpArr";
        DivideTimeByCount = false;
        this.totalOps = totalOps;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // aarch64 support is currently disabled
        return isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;
        // adds reading rdi (the pointer chasing result) must wait for the load
        string[] dependent = { " add %rdi, %r15", " add %rdi, %r14" };
        // adds on unrelated registers can issue immediately
        string[] independent = { " add %r13, %r11", " add %r12, %r11" };
        UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, totalOps, Counts, Prefix, dependent, independent, false);
    }
}
}
================================================
FILE: AsmGen/tests/AddSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// Integer scheduler capacity test: chains of adds dependent on pointer chasing
/// load results, generated for each supported ISA.
/// </summary>
public class AddSchedTest : UarchTest
{
    public AddSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "addsched";
        this.Description = "Scheduler, Integer Adds";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        if (isa == IUarchTest.ISA.mips64) return true;
        if (isa == IUarchTest.ISA.riscv) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // all adds read rdi, which the generator ties to the pointer chasing result
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " add %rdi, %r15";
            unrolledAdds[1] = " add %rdi, %r14";
            unrolledAdds[2] = " add %rdi, %r13";
            unrolledAdds[3] = " add %rdi, %r12";
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // adds read x25, the pointer chasing result register in the ARM generator
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " add x15, x15, x25";
            unrolledAdds[1] = " add x14, x14, x25";
            unrolledAdds[2] = " add x13, x13, x25";
            unrolledAdds[3] = " add x12, x12, x25";
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // separate arrays per chain: block 1 reads $r12, block 2 reads $r13
            // (the generator's two pointer chasing result registers)
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " add.d $r15, $r15, $r12";
            unrolledAdds[1] = " add.d $r16, $r16, $r12";
            unrolledAdds[2] = " add.d $r17, $r17, $r12";
            unrolledAdds[3] = " add.d $r18, $r18, $r12";
            string[] unrolledAdds1 = new string[4];
            unrolledAdds1[0] = " add.d $r15, $r15, $r13";
            unrolledAdds1[1] = " add.d $r16, $r16, $r13";
            unrolledAdds1[2] = " add.d $r17, $r17, $r13";
            unrolledAdds1[3] = " add.d $r18, $r18, $r13";
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, includePtrChasingLoads: false);
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            // block 1 reads x5, block 2 reads x6 (the RISC-V generator's load results)
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " add x30, x30, x5";
            unrolledAdds[1] = " add x29, x29, x5";
            unrolledAdds[2] = " add x28, x28, x5";
            unrolledAdds[3] = " add x31, x31, x5";
            string[] unrolledAdds1 = new string[4];
            unrolledAdds1[0] = " add x30, x30, x6";
            unrolledAdds1[1] = " add x31, x31, x6";
            unrolledAdds1[2] = " add x28, x28, x6";
            unrolledAdds1[3] = " add x29, x29, x6";
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, false);
        }
    }
}
}
================================================
FILE: AsmGen/tests/AddvNsq.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// ADDV (vector horizontal add) non-scheduling-queue test, aarch64 only:
/// ADDVs reading a vector loaded after the pointer chasing load are dependent;
/// ADDVs reading a vector loaded once up front are independent padding.
/// </summary>
public class AddvNsq : UarchTest
{
    // total instructions emitted per pointer chasing load (dependent + independent)
    private int totalOps;

    public AddvNsq(int low, int high, int step, int totalOps)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "addvnsq";
        this.Description = "ADDV, excluding possible NSQ";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.totalOps = totalOps;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            // d16 is reloaded after the pointer chasing load (address depends on w25)
            string postLoadInstrs1 = " ldr d16, [x2, w25, sxtw #0]";
            // d15 is loaded once up front: ADDVs reading v15 are independent
            string initInstrs = " ldr d15, [x2]";
            string[] depInstrs = new string[4];
            depInstrs[0] = " addv h1, v16.4h";
            depInstrs[1] = " addv h2, v16.4h";
            depInstrs[2] = " addv h3, v16.4h";
            depInstrs[3] = " addv h4, v16.4h";
            string[] indepInstrs = new string[4];
            indepInstrs[0] = " addv h1, v15.4h";
            indepInstrs[1] = " addv h2, v15.4h";
            indepInstrs[2] = " addv h3, v15.4h";
            indepInstrs[3] = " addv h4, v15.4h";
            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs,
                postLoadInstrs: postLoadInstrs1);
        }
    }
}
}
================================================
FILE: AsmGen/tests/AddvSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class AddvSched : UarchTest
{
public AddvSched(int low, int high, int step)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "addvsched";
this.Description = "ADDV Scheduler";
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
this.GetFunctionCallParameters = "structIterations, A, fpArr";
this.DivideTimeByCount = false;
}
public override bool SupportsIsa(IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.aarch64) return true;
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.aarch64)
{
string postLoadInstrs1 = " ldr q16, [x2, w25, sxtw #0]";
string postLoadInstrs2 = " ldr q16, [x2, w25, sxtw #0]";
string[] unrolledInstrs = new string[4];
unrolledInstrs[0] = " addv h1, v16.4h";
unrolledInstrs[1] = " addv h2, v16.4h";
unrolledInstrs[2] = " addv h3, v16.4h";
unrolledInstrs[3] = " addv h4, v16.4h";
UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null,
postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
}
}
}
}
================================================
FILE: AsmGen/tests/AeseSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class AeseSchedTest : UarchTest
{
public AeseSchedTest(int low, int high, int step)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "aesesched";
this.Description = "aese scheduler";
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
this.GetFunctionCallParameters = "structIterations, A, fpArr";
this.DivideTimeByCount = false;
}
public override bool SupportsIsa(IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64) return true;
if (isa == IUarchTest.ISA.aarch64) return true;
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64)
{
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " aesenc %xmm0, %xmm1";
unrolledAdds[1] = " aesenc %xmm0, %xmm2";
unrolledAdds[2] = " aesenc %xmm0, %xmm3";
unrolledAdds[3] = " aesenc %xmm0, %xmm4";
UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
}
if (isa == IUarchTest.ISA.aarch64)
{
string postLoadInstrs1 = " ldr q0, [x2, w25, uxtw#0]";
string postLoadInstrs2 = " ldr q0, [x2, w26, uxtw#0]";
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " aese v1.16b, v0.16b";
unrolledAdds[1] = " aese v2.16b, v0.16b";
unrolledAdds[2] = " aese v3.16b, v0.16b";
unrolledAdds[3] = " aese v4.16b, v0.16b";
UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, null, postLoadInstrs1, postLoadInstrs2);
}
}
}
}
================================================
FILE: AsmGen/tests/AesencNsq.cs
================================================
using System.Text;
namespace AsmGen
{
public class AesencNsq : UarchTest
{
private int totalOps;
public AesencNsq(int low, int high, int step, int totalOps)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "aesencnsq" + totalOps;
this.Description = "AESENC, excluding possible NSQ";
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
this.GetFunctionCallParameters = "structIterations, A, fpArr";
this.DivideTimeByCount = false;
this.totalOps = totalOps;
}
public override bool SupportsIsa(IUarchTest.ISA isa)
{
// if (isa == IUarchTest.ISA.aarch64) return true;
if (isa == IUarchTest.ISA.amd64) return true;
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64)
{
string postLoadInstrs = " mov %rdi, %r15\n add %r8, %r15\n movdqu (%r15), %xmm1";
string initInstrs = " movdqu (%r8), %xmm2";
string[] depInstrs = new string[4];
depInstrs[0] = " aesenc %xmm1, %xmm0";
depInstrs[1] = " aesenc %xmm1, %xmm3";
depInstrs[2] = " aesenc %xmm1, %xmm4";
depInstrs[3] = " aesenc %xmm1, %xmm5";
string[] indepInstrs = new string[2];
indepInstrs[0] = " aesenc %xmm2, %xmm6";
indepInstrs[1] = " aesenc %xmm2, %xmm7";
UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs, postLoadInstrs);
}
else if (isa == IUarchTest.ISA.aarch64)
{
string postLoadInstrs1 = " ldr s16, [x2, w25, uxtw #2]";
string initInstrs = " ldr s15, [x2]";
string[] depInstrs = new string[4];
depInstrs[0] = " fadd s0, s0, s16";
depInstrs[1] = " fadd s1, s1, s16";
depInstrs[2] = " fadd s2, s2, s16";
depInstrs[3] = " fadd s3, s3, s16";
string[] indepInstrs = new string[4];
indepInstrs[0] = " fadd s17, s17, s15";
indepInstrs[1] = " fadd s18, s18, s15";
indepInstrs[2] = " fadd s19, s19, s15";
indepInstrs[3] = " fadd s20, s20, s15";
UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs,
postLoadInstrs: postLoadInstrs1);
}
}
}
}
================================================
FILE: AsmGen/tests/BranchBufferTest.cs
================================================
using System.Text;
namespace AsmGen
{
    /// <summary>
    /// Branch order buffer (BOB) capacity test. Each iteration begins with a
    /// pointer-chasing load, then issues a run of never-taken conditional
    /// branches that cannot retire until that load completes. Timing vs. branch
    /// count reveals how many unretired branches the core can track.
    /// </summary>
    public class BranchBufferTest : UarchTest
    {
        // If true, insert a nop after each branch to spread branches out (x86 only).
        private bool mixNops;
        // If true, follow each load with a branch that depends on the loaded
        // value. Only generated for aarch64 (see SupportsIsa).
        private bool initialDependentBranch;

        public BranchBufferTest(int low, int high, int step, bool mixNops = false, bool initialDependentBranch = false)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "bob" + (initialDependentBranch ? "db" : string.Empty);
            this.Description = "Branch Order Buffer Test (not-taken branches pending retire)" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty); ;
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
            this.mixNops = mixNops;
            this.initialDependentBranch = initialDependentBranch;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            // The dependent-branch variant is only implemented for aarch64.
            if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.mips64) return true;
            return false;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                GenerateX86GccAsm(sb);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                GenerateArmAsm(sb);
                // Emit the shared out-of-line target that the dependent branches use.
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                GenerateMipsAsm(sb);
            }
        }

        // Emits one test function per branch count in GNU (AT&T) x86-64 syntax.
        // r14=1 and r11=4 are loop-invariant, so every `cmp %r14, %r11; je` pair
        // below is a never-taken branch that still occupies a BOB entry.
        public void GenerateX86GccAsm(StringBuilder sb)
        {
            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = Prefix + Counts[i];
                sb.AppendLine("\n" + funcName + ":");
                sb.AppendLine(" push %rsi");
                sb.AppendLine(" push %rdi");
                sb.AppendLine(" push %r15");
                sb.AppendLine(" push %r14");
                sb.AppendLine(" push %r13");
                sb.AppendLine(" push %r12");
                sb.AppendLine(" push %r11");
                sb.AppendLine(" push %r8");
                sb.AppendLine(" push %rcx");
                sb.AppendLine(" push %rdx");
                // arguments are in RDI, RSI, RDX, RCX, R8, and R9
                // move them into familiar windows argument regs (rcx, rdx, r8)
                sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
                sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
                sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
                sb.AppendLine(" xor %r15, %r15");
                sb.AppendLine(" mov $0x1, %r14");
                sb.AppendLine(" mov $0x2, %r13");
                sb.AppendLine(" mov $0x3, %r12");
                sb.AppendLine(" mov $0x4, %r11");
                // rdi/rsi index two independent pointer-chasing chains in the array,
                // starting at elements 0 and 0x40.
                sb.AppendLine(" xor %rdi, %rdi");
                sb.AppendLine(" mov $0x40, %esi");
                sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
                sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
                sb.AppendLine("\n" + funcName + "start:");
                // current = A[current]; the branches below retire behind this load
                sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
                {
                    string jumpLabel = $"{funcName}_edi_target{fillerIdx}";
                    sb.AppendLine($" cmp %r14, %r11");
                    sb.AppendLine($" je {jumpLabel}");
                    // try to space the jumps out a bit
                    if (this.mixNops) sb.AppendLine($" nop");
                    sb.AppendLine($"{jumpLabel}:");
                }
                // second chain's load, followed by another run of not-taken branches
                sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
                {
                    string jumpLabel = $"{funcName}_esi_target{fillerIdx}";
                    sb.AppendLine($" cmp %r14, %r11");
                    sb.AppendLine($" je {jumpLabel}");
                    if (this.mixNops) sb.AppendLine($" nop");
                    // try to space the jumps out a bit
                    sb.AppendLine($"{jumpLabel}:");
                }
                sb.AppendLine(" dec %rcx");
                sb.AppendLine(" jne " + funcName + "start");
                sb.AppendLine(" pop %rdx");
                sb.AppendLine(" pop %rcx");
                sb.AppendLine(" pop %r8");
                sb.AppendLine(" pop %r11");
                sb.AppendLine(" pop %r12");
                sb.AppendLine(" pop %r13");
                sb.AppendLine(" pop %r14");
                sb.AppendLine(" pop %r15");
                sb.AppendLine(" pop %rdi");
                sb.AppendLine(" pop %rsi");
                sb.AppendLine(" ret\n\n");
            }
        }

        // aarch64 version. x15=1 and x10=6 never compare equal, so each
        // `cmp x15, x10; b.eq` pair is a never-taken branch pending retire.
        public void GenerateArmAsm(StringBuilder sb)
        {
            string dependentBranch = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = Prefix + Counts[i];
                sb.AppendLine("\n" + funcName + ":");
                // save x10-x15, x25, x26 on the stack before clobbering them
                sb.AppendLine(" sub sp, sp, #0x50");
                sb.AppendLine(" stp x14, x15, [sp, #0x10]");
                sb.AppendLine(" stp x12, x13, [sp, #0x20]");
                sb.AppendLine(" stp x10, x11, [sp, #0x30]");
                sb.AppendLine(" stp x25, x26, [sp, #0x40]");
                sb.AppendLine(" mov x15, 1");
                sb.AppendLine(" mov x14, 2");
                sb.AppendLine(" mov x13, 3");
                sb.AppendLine(" mov x12, 4");
                sb.AppendLine(" mov x11, 5");
                sb.AppendLine(" mov x10, 6");
                // w25/w26 index two independent pointer-chasing chains
                sb.AppendLine(" mov w25, 0x0");
                sb.AppendLine(" mov w26, 0x40");
                sb.AppendLine("\n" + funcName + "start:");
                sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
                if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
                {
                    string jumpLabel = $"{funcName}_w25_target{fillerIdx}";
                    sb.AppendLine($" cmp x15, x10");
                    sb.AppendLine($" b.eq {jumpLabel}");
                    sb.AppendLine($"{jumpLabel}:");
                }
                // second chain's load, then another run of not-taken branches
                sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
                if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
                for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
                {
                    string jumpLabel = $"{funcName}_w26_target{fillerIdx}";
                    sb.AppendLine($" cmp x15, x10");
                    sb.AppendLine($" b.eq {jumpLabel}");
                    sb.AppendLine($"{jumpLabel}:");
                }
                sb.AppendLine(" sub x0, x0, 1");
                sb.AppendLine(" cbnz x0, " + funcName + "start");
                // restore saved registers and return
                sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
                sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
                sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
                sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
                sb.AppendLine(" add sp, sp, #0x50");
                sb.AppendLine(" ret\n\n");
            }
        }

        // NOTE(review): despite the mips64 name, mnemonics like ld.d/addi.d/sub.d
        // and the $r1 return register look like LoongArch — confirm the target.
        // r15 is set to 15 up front, so every `beqz $r15` below is never taken;
        // the collected jump targets (which just return) are emitted after all
        // functions and are never actually reached.
        public void GenerateMipsAsm(StringBuilder sb)
        {
            StringBuilder ntJumpTargets = new StringBuilder();
            for (int i = 0; i < Counts.Length; i++)
            {
                string initInstrs = " move $r15, $r0\n addi.d $r15, $r15, 15";
                string funcName = this.Prefix + Counts[i];
                // args in r4 = iterations, r5 = list, r6 = list (sink)
                // use r12 and r13 for ptr chasing loads, r14 as decrement for iteration count
                sb.AppendLine("\n" + funcName + ":");
                sb.AppendLine(" ld.d $r12, $r5, 0");
                sb.AppendLine(" ld.d $r13, $r5, 64");
                sb.AppendLine(" xor $r14, $r14, $r14");
                sb.AppendLine(" addi.d $r14, $r14, 1");
                sb.AppendLine(initInstrs);
                sb.AppendLine("\n" + funcName + "start:");
                sb.AppendLine(" ld.d $r12, $r12, 0");
                int fillerInstrCount = Counts[i];
                for (int instrIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
                {
                    string jumpLabel = "dontenduphere_r12_" + this.Prefix + "_" + Counts[i] + "_" + instrIdx;
                    sb.AppendLine($" beqz $r15, {jumpLabel}");
                    ntJumpTargets.AppendLine(jumpLabel + ":");
                    ntJumpTargets.AppendLine(" jr $r1");
                }
                sb.AppendLine(" ld.d $r13, $r13, 0");
                for (int instrIdx = 0; instrIdx < fillerInstrCount; instrIdx++)
                {
                    string jumpLabel = "dontenduphere_r13_" + this.Prefix + "_" + Counts[i] + "_" + instrIdx;
                    sb.AppendLine($" beqz $r15, {jumpLabel}");
                    ntJumpTargets.AppendLine(jumpLabel + ":");
                    ntJumpTargets.AppendLine(" jr $r1");
                }
                sb.AppendLine(" sub.d $r4, $r4, $r14");
                sb.AppendLine(" bnez $r4, " + funcName + "start");
                sb.AppendLine(" jr $r1");
            }
            sb.AppendLine(ntJumpTargets.ToString());
        }
    }
}
================================================
FILE: AsmGen/tests/BranchHistoryTest.cs
================================================
using System.IO;
using System.Text;
namespace AsmGen
{
    /// <summary>
    /// Branch predictor pattern-recognition test. Each generated function runs a
    /// fixed number of conditional branches per iteration, each taken/not-taken
    /// according to its own pattern array. The C harness sweeps pattern (history)
    /// lengths at runtime to map out global history capacity.
    /// </summary>
    public class BranchHistoryTest : IUarchTest
    {
        public string Prefix { get; private set; }
        public string Description { get; private set; }
        public string FunctionDefinitionParameters { get; private set; }
        public string GetFunctionCallParameters { get; private set; }
        public bool DivideTimeByCount { get; private set; }
        // One test function is generated per entry in branchCounts.
        private int[] branchCounts;
        // Pattern lengths the runtime harness sweeps (see GenerateInitializationCode).
        private int[] historyCounts;

        public BranchHistoryTest()
        {
            Prefix = "branchhist";
            Description = "Branch predictor pattern recognition";
            FunctionDefinitionParameters = "uint64_t iterations, uint32_t **arr, uint32_t arrLen";
            GetFunctionCallParameters = "structIterations";
            DivideTimeByCount = true;
            branchCounts = new int[] { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 };
            // NOTE(review): 24567 below looks like a typo for 24576 (24K), judging
            // by the progression around it — confirm intent before changing.
            historyCounts = new int[] { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768, 1024, 1536,
                2048, 3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24567, 32768 };
        }

        public bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.mips64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        public void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb);
            if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);
            if (isa == IUarchTest.ISA.mips64) GenerateMipsAsm(sb);
            if (isa == IUarchTest.ISA.riscv) GenerateRiscvAsm(sb);
        }

        // aarch64: x0 = iterations, x1 = array of pattern-array pointers,
        // w2 = pattern length. x12 counts not-taken branches (returned in x0).
        public void GenerateArmAsm(StringBuilder sb)
        {
            for (int i = 0; i < branchCounts.Length; i++)
            {
                string functionLabel = Prefix + branchCounts[i];
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine("\n" + functionLabel + ":");
                sb.AppendLine(" sub sp, sp, #0x40");
                sb.AppendLine(" stp x11, x12, [sp, #0x30]");
                sb.AppendLine(" stp x15, x16, [sp, #0x20]");
                sb.AppendLine(" stp x13, x14, [sp, #0x10]");
                sb.AppendLine(" eor x16, x16, x16");
                sb.AppendLine(" eor x15, x15, x15");
                sb.AppendLine(" eor x12, x12, x12");
                sb.AppendLine(" eor x11, x11, x11");
                // w14 = branch index, w16 = pattern array index
                sb.AppendLine(loopLabel + ":");
                sb.AppendLine(" eor w14, w14, w14");
                // generate branch blocks
                for (int branchCount = 0; branchCount < branchCounts[i]; branchCount++)
                {
                    string jumpTarget = functionLabel + branchCounts[i] + "_zero" + branchCount;
                    // x15 = this branch's pattern array; w13 = pattern element
                    sb.AppendLine(" ldr x15, [x1, w14, uxtw #3]");
                    sb.AppendLine(" add w14, w14, 1");
                    sb.AppendLine(" ldr w13, [x15, w16, uxtw #2]");
                    sb.AppendLine($" cbnz x13, {jumpTarget}");
                    sb.AppendLine(" add x12, x12, 1");
                    sb.AppendLine(jumpTarget + ":");
                }
                // increment w16, and basically cmov 0 -> w16 if w16 = list length
                sb.AppendLine(" add w16, w16, 1");
                sb.AppendLine(" cmp w16, w2");
                sb.AppendLine(" csel w16, w11, w16, EQ");
                sb.AppendLine(" sub x0, x0, 1");
                sb.AppendLine($" cbnz x0, {loopLabel}");
                sb.AppendLine(" mov x0, x12");
                sb.AppendLine(" ldp x11, x12, [sp, #0x30]");
                sb.AppendLine(" ldp x15, x16, [sp, #0x20]");
                sb.AppendLine(" ldp x13, x14, [sp, #0x10]");
                sb.AppendLine(" add sp, sp, #0x40");
                sb.AppendLine(" ret");
            }
        }

        // x86-64 SysV: rdi = iterations, rsi = array of pattern-array pointers,
        // rdx = pattern length. r8 counts not-taken branches (returned in rax).
        public void GenerateX86GccAsm(StringBuilder sb)
        {
            for (int i = 0; i < branchCounts.Length; i++)
            {
                string functionLabel = Prefix + branchCounts[i];
                sb.AppendLine("\n" + functionLabel + ":");
                sb.AppendLine(" push %rbx");
                sb.AppendLine(" push %r8");
                sb.AppendLine(" push %r9");
                sb.AppendLine(" xor %rbx, %rbx");
                sb.AppendLine(" xor %r8, %r8");
                sb.AppendLine(" xor %r9, %r9");
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine("\n" + loopLabel + ":");
                sb.AppendLine(" xor %r11, %r11"); // set index into arr of arrs to 0
                for (int branchCount = 0; branchCount < branchCounts[i]; branchCount++)
                {
                    sb.AppendLine(" mov (%rsi,%r11,8), %r10"); // load array base pointer into r10
                    sb.AppendLine(" inc %r11");
                    sb.AppendLine(" mov (%r10,%rbx,4), %eax "); // read element from branch history test array
                    sb.AppendLine(" test %eax, %eax");
                    // conditional branch on test array value
                    string zeroLabel = Prefix + branchCounts[i] + "_zero" + branchCount;
                    sb.AppendLine(" jz " + zeroLabel);
                    sb.AppendLine(" inc %r8"); // r8 is just a sink here
                    sb.AppendLine(zeroLabel + ":");
                }
                // loop around in pattern history test array if necessary
                // avoiding an extra branch to not pollute BPU history
                sb.AppendLine(" inc %rbx");
                sb.AppendLine(" cmp %rbx, %rdx");
                sb.AppendLine(" cmove %r9, %rbx");
                // end of main loop over iteration count
                sb.AppendLine(" dec %rdi");
                sb.AppendLine(" jnz " + loopLabel);
                // function epilogue
                sb.AppendLine(" mov %r8, %rax");
                sb.AppendLine(" pop %r9");
                sb.AppendLine(" pop %r8");
                sb.AppendLine(" pop %rbx");
                sb.AppendLine(" ret");
            }
        }

        // NOTE(review): despite the mips64 name, mnemonics like alsl.d/maskeqz
        // and $r registers look like LoongArch — confirm the actual target ISA.
        public void GenerateMipsAsm(StringBuilder sb)
        {
            // Generate an array of branch history test functions, one for each branch count
            for (int i = 0; i < branchCounts.Length; i++)
            {
                // branchtestFunc(iterations, testArrToArr, historyLen)
                // r4 = iterations, r5 = array of pointers to pattern arrays for each branch, r6 = history length (length of each array)
                // temporary registers: r12-r20
                // write code here
                string functionLabel = Prefix + branchCounts[i];
                sb.AppendLine("\n" + functionLabel + ":");
                // r12 = branch index, r13 = index into pattern array
                sb.AppendLine(" move $r13, $r0");
                sb.AppendLine(" move $r18, $r0");
                sb.AppendLine(" move $r20, $r0");
                sb.AppendLine(" addi.d $r20, $r20, 1");
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine("\n" + loopLabel + ":");
                sb.AppendLine(" move $r12, $r0"); // set branch index to zero
                // generate branch blocks
                for (int branchCount = 0; branchCount < branchCounts[i]; branchCount++)
                {
                    string jumpTarget = functionLabel + branchCounts[i] + "_zero" + branchCount;
                    // load the branch's pattern array
                    sb.AppendLine(" alsl.d $r14, $r12, $r0, 0x3"); // get offset into array in bytes, using r12 as array index.
                    sb.AppendLine(" add.d $r14, $r14, $r5"); // get address into r14
                    sb.AppendLine(" ld.d $r15, $r14, 0"); // r15 = base address of curent branch's target array
                    sb.AppendLine(" addi.d $r12, $r12, 1"); // next branch
                    // load element from pattern array indicating where we should branch
                    sb.AppendLine(" alsl.d $r16, $r13, $r0, 0x2"); // use r13 to index into pattern array
                    sb.AppendLine(" add.d $r16, $r16, $r15"); // r16 = address of element we want to load
                    sb.AppendLine(" ld.w $r17, $r16, 0");
                    sb.AppendLine($" bnez $r17, {jumpTarget}"); // branch if 1
                    sb.AppendLine(" addi.d $r18, $r18, 1");
                    sb.AppendLine(jumpTarget + ":");
                }
                // increment w16, and basically cmov 0 -> w16 if w16 = list length
                // increment r13 (idx into pattern array)
                sb.AppendLine(" addi.d $r13, $r13, 1");
                sb.AppendLine(" sub.d $r19, $r6, $r13"); // r19 = history length - index
                sb.AppendLine(" maskeqz $r13, $r13, $r19"); // set index back to 0 to repeat pattern, if history length - index == 0
                sb.AppendLine(" sub.d $r4, $r4, $r20"); // decrement iteration count
                sb.AppendLine($" bnez $r4, {loopLabel}");
                sb.AppendLine(" move $r4, $r18"); // return the count of NT branches for tracking RNG quality
                sb.AppendLine(" jr $r1");
            }
        }

        // RISC-V: a0 = iterations, a1 = array of pattern-array pointers,
        // a2 = pattern length. t6 counts taken branches (returned in a0).
        public void GenerateRiscvAsm(StringBuilder sb)
        {
            // Generate an array of branch history test functions, one for each branch count
            for (int i = 0; i < branchCounts.Length; i++)
            {
                // branchtestFunc(iterations, testArrToArr, historyLen)
                // a0 = iterations, a1 = array of pointers to pattern arrays for each branch, a2 = length of each array (history length)
                // t0-t7 temporary registers
                // write code here
                string functionLabel = Prefix + branchCounts[i];
                sb.AppendLine("\n" + functionLabel + ":");
                // s0 is callee-saved, so spill it before use
                sb.AppendLine(" addi sp, sp, -16");
                sb.AppendLine(" sd s0, (sp)");
                // t1 = index into pattern array
                sb.AppendLine(" li t1, 0");
                sb.AppendLine(" li t6, 0");
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine("\n" + loopLabel + ":");
                sb.AppendLine(" mv t2, a1"); // start of array of pointers to pattern arrays
                // generate branchCount blocks, each of which traverses its own array
                for (int branchCount = 0; branchCount < branchCounts[i]; branchCount++)
                {
                    string jumpTarget = functionLabel + branchCounts[i] + "_zero" + branchCount;
                    // load the branch's pattern array (a1 -> ptr -> array)
                    sb.AppendLine(" ld t3, (t2)"); // load pointer to array
                    // t3 = base address of branch's array
                    sb.AppendLine(" slli t4, t1, 2");
                    sb.AppendLine(" add t4, t4, t3");
                    sb.AppendLine(" lw t5, (t4)"); // should have 1 or 0
                    sb.AppendLine(" addi t2, t2, 8"); // next branch
                    sb.AppendLine($" beq t5, x0, {jumpTarget}");
                    sb.AppendLine(" addi t6, t6, 1"); // dummy increment to track not-taken/taken branch ratio
                    sb.AppendLine(jumpTarget + ":");
                }
                sb.AppendLine(" addi t1, t1, 1"); // increment array index
                sb.AppendLine(" slt s0, t1, a2"); // 1 if within range
                sb.AppendLine(" mul t1, t1, s0"); // multiply by 1 if within range, 0 otherwise
                // decrement iteration count
                sb.AppendLine(" addi a0, a0, -1");
                sb.AppendLine($" bne a0, x0, {loopLabel}");
                sb.AppendLine(" mv a0, t6");
                sb.AppendLine(" ld s0, (sp)");
                sb.AppendLine(" addi sp, sp, 16");
                sb.AppendLine(" ret");
            }
        }

        // Emits the C dispatch block that runs this test when selected by name.
        public void GenerateTestBlock(StringBuilder sb, IUarchTest.ISA isa)
        {
            sb.AppendLine(" if (argc > 1 && strcmp(test_name, \"" + Prefix + "\") == 0) {");
            sb.AppendLine(" printf(\"" + Description + ":\\n\");");
            GenerateCommonTestBlock(sb);
            sb.AppendLine(" }\n");
        }

        // Declares each generated function as a global assembler symbol.
        public void GenerateAsmGlobalLines(StringBuilder sb)
        {
            for (int i = 0; i < branchCounts.Length; i++)
                sb.AppendLine(".global " + Prefix + branchCounts[i]);
        }

        // kinda hack this to put in initialization code we need
        public void GenerateExternLines(StringBuilder sb)
        {
            for (int i = 0; i < branchCounts.Length; i++)
                sb.AppendLine("extern uint64_t " + Prefix + branchCounts[i] + $"({FunctionDefinitionParameters}) __attribute((sysv_abi));");
            GenerateInitializationCode(sb, true);
            string gccFunction = File.ReadAllText(Path.Combine(Program.DataFilesDir, "GccBranchHistFunction.c"));
            sb.AppendLine(gccFunction);
        }

        // Emits C arrays of branch counts / history lengths and a function-pointer
        // table so the harness can dispatch to the generated asm functions.
        public void GenerateInitializationCode(StringBuilder sb, bool gcc)
        {
            sb.AppendLine($"uint32_t maxBranchCount = {branchCounts.Length};");
            sb.Append($"uint32_t branchCounts[{branchCounts.Length}] = ");
            sb.Append("{ " + branchCounts[0]);
            for (int i = 1; i < branchCounts.Length; i++) sb.Append(", " + branchCounts[i]);
            sb.AppendLine(" };");
            sb.Append($"uint32_t branchHistoryLengths[{historyCounts.Length}] = ");
            sb.Append("{ " + historyCounts[0]);
            for (int i = 1; i < historyCounts.Length; i++) sb.Append(", " + historyCounts[i]);
            sb.AppendLine(" };");
            if (gcc) sb.AppendLine($"uint64_t (__attribute((sysv_abi)) *branchtestFuncArr[{branchCounts.Length}])(uint64_t iterations, uint32_t **arr, uint32_t arrLen);");
            else sb.AppendLine($"uint64_t (*branchtestFuncArr[{branchCounts.Length}])(uint64_t iterations, uint32_t **arr, uint32_t arrLen);");
            sb.AppendLine("void initializeBranchHistFuncArr() {");
            for (int i = 0; i < branchCounts.Length; i++)
            {
                sb.AppendLine($" branchtestFuncArr[{i}] = {Prefix + branchCounts[i]};");
            }
            sb.AppendLine("}");
        }

        // Copies the shared C test-body template from the data files directory.
        public void GenerateCommonTestBlock(StringBuilder sb)
        {
            string branchhistMain = File.ReadAllText(Path.Combine(Program.DataFilesDir, "BranchhistTestBlock.c"));
            sb.AppendLine(branchhistMain);
        }
    }
}
================================================
FILE: AsmGen/tests/BtbTest.cs
================================================
using System;
using System.Text;
namespace AsmGen
{
    /// <summary>
    /// Branch target buffer (BTB) capacity test: generates functions containing a
    /// chain of always-taken branches at a fixed byte spacing. Timing per branch
    /// vs. branch count reveals BTB size and any spacing-related restrictions.
    /// </summary>
    public class BtbTest : UarchTest
    {
        // Byte distance between consecutive branches.
        private int spacing;
        private BranchType branchType;
        // If true (x86 only), vary the padding after alignment to perturb placement.
        private bool varyspacing;

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.mips64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        public enum BranchType
        {
            /// <summary>
            /// Conditional branches that are always taken
            /// </summary>
            Conditional,
            /// <summary>
            /// Unconditional jmps
            /// </summary>
            Unconditional,
            /// <summary>
            /// A mix of both to max out Zen 2's BTB capacity
            /// Optimization guide says one entry can track two branches if they're in the same 64B line
            /// and the first is conditional
            /// </summary>
            ZenMix
        }

        /// <summary>
        /// Constructor for BTB test
        /// </summary>
        /// <param name="spacing">How far apart branches should be, in bytes.
        /// For 4-byte-instruction ISAs, Get4BNopAlign supports 4/8/16/32/64.</param>
        /// <param name="branchType">Kind of branch to emit (conditional branches are still always taken)</param>
        /// <param name="varyspacing">If true, vary nop padding between branches (x86 only)</param>
        public BtbTest(int spacing, BranchType branchType, bool varyspacing = false)
        {
            this.Counts = new int[] { 1, 2, 4, 8, 16, 32, 48, 56, 64, 128, 256, 512, 768, 1024, 1536, 2048,
                3072, 4096, 4608, 5120, 6144, 7168, 8192, 10240, 12288, 14336, 16384, 20480, 24576, 28672, 32768, 40960, 49152 };
            this.Prefix = "btb" + spacing + (varyspacing ? "v" : "") + branchType;
            this.Description = $"Branch Target Buffer, " + branchType + $" branch every {spacing} bytes " + (varyspacing ? " (varied spacing)" : "");
            this.FunctionDefinitionParameters = "uint64_t iterations";
            this.GetFunctionCallParameters = "structIterations";
            this.DivideTimeByCount = true;
            this.spacing = spacing;
            this.branchType = branchType;
            this.varyspacing = varyspacing;
        }

        // Name of the generated function for a given branch count.
        private string GetBranchFuncName(int branchCount) { return Prefix + branchCount; }
        // Label for an individual branch target within a function.
        public string GetLabelName(string funcName, int part) { return funcName + "part" + part; }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                GenerateX86GccAsm(sb);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                GenerateArmAsm(sb);
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                GenerateMipsAsm(sb);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                GenerateRiscvAsm(sb);
            }
        }

        // x86-64: spacing is enforced with .align directives; rax stays 0 so
        // every `jz` below is always taken.
        public void GenerateX86GccAsm(StringBuilder sb)
        {
            string paddingAlign = " .align " + spacing;
            int spacingNops = 0;
            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = GetBranchFuncName(Counts[i]);
                //sb.AppendLine("; Start of function for branch count " + branchCounts[i] + " padding " + paddings[p]);
                sb.AppendLine(funcName + ":\n");
                sb.AppendLine(" xor %rax, %rax");
                if (branchType == BranchType.ZenMix) sb.AppendLine(" .align 64");
                for (int branchIdx = 1; branchIdx < Counts[i]; branchIdx++)
                {
                    string labelName = GetLabelName(funcName, branchIdx);
                    if (branchType == BranchType.Conditional)
                    {
                        sb.AppendLine(" test %rax, %rax");
                        sb.AppendLine(" jz " + labelName); // should always be set
                    }
                    else if (branchType == BranchType.Unconditional)
                    {
                        sb.AppendLine(" jmp " + labelName);
                    }
                    else if (branchType == BranchType.ZenMix)
                    {
                        // alternate unconditional / conditional so pairs share a line
                        if ((branchIdx & 0x1) == 0)
                        {
                            sb.AppendLine(" jmp " + labelName);
                        }
                        else
                        {
                            sb.AppendLine(" test %rax, %rax");
                            sb.AppendLine(" jz " + labelName);
                        }
                    }
                    sb.AppendLine(paddingAlign);
                    if (varyspacing)
                    {
                        // add 0..6 extra nops after alignment, cycling each branch
                        for (int nopIdx = 0; nopIdx < spacingNops; nopIdx++)
                        {
                            sb.AppendLine(" nop");
                        }
                        spacingNops++;
                        if (spacingNops > 6) spacingNops = 0;
                    }
                    sb.AppendLine(labelName + ":");
                }
                sb.AppendLine(" dec %rdi");
                sb.AppendLine(" jne " + funcName);
                sb.AppendLine(" ret\n\n");
                // don't let it get too close to the next branch
                sb.AppendLine(paddingAlign);
            }
        }

        // Builds a nop run that pads a 4-byte branch out to `spacing` bytes,
        // assuming fixed 4-byte instructions (aarch64 and the "mips" target).
        private string Get4BNopAlign()
        {
            string paddingAlign = "";
            if (spacing == 8)
            {
                paddingAlign = " nop";
            }
            else if (spacing == 16)
            {
                paddingAlign = " nop\n nop\n nop";
            }
            else if (spacing == 32)
            {
                paddingAlign = " nop\n nop\n nop\n nop\n nop\n nop\n nop";
            }
            else if (spacing == 64)
            {
                paddingAlign = " nop\n nop\n nop\n nop\n nop\n nop\n nop\n";
                paddingAlign += " nop\n nop\n nop\n nop\n nop\n nop\n nop\n nop";
            }
            else if (spacing != 4)
            {
                // spacing == 4 needs no padding; anything else is unsupported
                Console.WriteLine($"Unsupported padding value {spacing}");
                throw new NotImplementedException("Unsupported padding value");
            }
            return paddingAlign;
        }

        // aarch64: x1 = 1 so `cbnz x1` is always taken. For very large loop
        // bodies, an indirect branch via x2 avoids conditional-branch range limits.
        public void GenerateArmAsm(StringBuilder sb)
        {
            // things are 4 bytes on aarch64
            string paddingAlign = Get4BNopAlign();
            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = GetBranchFuncName(Counts[i]);
                string funcTargetName = GetBranchFuncName(Counts[i]) + "_itarget";
                sb.AppendLine(funcName + ":");
                // x2 = function address, used for the long-range indirect loop branch
                sb.AppendLine($" adrp x2, {funcName}");
                sb.AppendLine($" add x2, x2, :lo12:{funcName}");
                sb.AppendLine(" mov x1, 1");
                sb.AppendLine(".align 16");
                sb.AppendLine(funcTargetName + ":");
                for (int branchIdx = 1; branchIdx < Counts[i]; branchIdx++)
                {
                    string labelName = GetLabelName(funcName, branchIdx);
                    if (branchType == BranchType.Unconditional)
                        sb.AppendLine(" b " + labelName);
                    else if (branchType == BranchType.Conditional)
                        sb.AppendLine(" cbnz x1, " + labelName); // x1 = 1 from earlier, should never be zero
                    else if (branchType == BranchType.ZenMix)
                    {
                        if ((branchIdx & 0x1) == 0) sb.AppendLine(" b " + labelName);
                        else sb.AppendLine(" cbnz x1, " + labelName);
                    }
                    sb.AppendLine(paddingAlign);
                    sb.AppendLine(labelName + ":");
                }
                sb.AppendLine(paddingAlign);
                sb.AppendLine(" sub x0, x0, 1");
                // aarch64 is a mess. try to avoid 'relocation truncated to fit' issues with an indirect branch
                if (spacing * Counts[i] >= (1024 * 1024 - 20))
                {
                    string workaroundTarget = funcName + "_aarch64_indirect_workaround";
                    // jump over indirect branch to return, on zero
                    // this branch should be not taken for all except the last iteration, and should have minimal
                    // impact on results because a predicted NT branch is sort of 'free' on most architectures
                    sb.AppendLine(" cbz x0, " + workaroundTarget);
                    sb.AppendLine(" br x2");
                    sb.AppendLine(workaroundTarget + ":");
                }
                else
                {
                    sb.AppendLine(" cbnz x0, " + funcTargetName);
                }
                sb.AppendLine(" ret\n\n");
                // don't let it get too close to the next branch
                sb.AppendLine(paddingAlign);
            }
        }

        // NOTE(review): despite the mips64 name, mnemonics like addi.d/sub.d and
        // the $r1 return register look like LoongArch — confirm the target ISA.
        // r13 stays 0 so every `beqz $r13` is always taken.
        public void GenerateMipsAsm(StringBuilder sb)
        {
            string paddingAlign = Get4BNopAlign();
            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = GetBranchFuncName(Counts[i]);
                string funcTargetName = GetBranchFuncName(Counts[i]) + "_itarget";
                sb.AppendLine(funcName + ":");
                sb.AppendLine(" xor $r12, $r12, $r12");
                sb.AppendLine(" addi.d $r12, $r12, 1");
                sb.AppendLine(" xor $r13, $r13, $r13");
                // r14 = loop start address for the long-range indirect branch
                sb.AppendLine(" la $r14, " + funcTargetName);
                sb.AppendLine(funcTargetName + ":");
                for (int branchIdx = 1; branchIdx < Counts[i]; branchIdx++)
                {
                    string labelName = GetLabelName(funcName, branchIdx);
                    sb.AppendLine(" beqz $r13, " + labelName);
                    sb.AppendLine(paddingAlign);
                    sb.AppendLine(labelName + ":");
                }
                sb.AppendLine(" sub.d $r4, $r4, $r12"); // decrement iteration count
                int distance = spacing * Counts[i];
                if (distance < 1024)
                {
                    sb.AppendLine(" bnez $r4, " + funcTargetName); // short branch if we're not too far away
                }
                else
                {
                    string workaroundTarget = funcName + "_mips_indirect_workaround";
                    sb.AppendLine(" beqz $r4, " + workaroundTarget); // jump over indirect branch if iteration count is reached
                    sb.AppendLine(" jr $r14"); // jump back to target (start of loop)
                    sb.AppendLine(workaroundTarget + ":");
                }
                sb.AppendLine(" jr $r1");
            }
        }

        // Builds a nop run that pads a branch out to `spacing` bytes.
        // Assumes compressed (RVC) encodings: 2-byte branch, 2-byte nops —
        // TODO confirm the assembler actually emits compressed forms here.
        private string GetRiscvNopAlign()
        {
            // branch takes 16 bits (2 bytes)
            int paddingNeeded = spacing - 2;
            // each NOP is 2 bytes
            StringBuilder nopSb = new StringBuilder();
            for (int i = 0; i < paddingNeeded; i += 2)
            {
                nopSb.AppendLine(" nop");
            }
            return nopSb.ToString();
        }

        // RISC-V: unconditional `j` chain regardless of branchType. For large
        // loop bodies, an indirect jalr via x5 avoids branch range limits.
        public void GenerateRiscvAsm(StringBuilder sb)
        {
            string paddingAlign = GetRiscvNopAlign();
            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = GetBranchFuncName(Counts[i]);
                string funcTargetName = GetBranchFuncName(Counts[i]) + "_itarget";
                sb.AppendLine(funcName + ":");
                sb.AppendLine(" la x5, " + funcTargetName);
                sb.AppendLine(funcTargetName + ":");
                for (int branchIdx = 1; branchIdx < Counts[i]; branchIdx++)
                {
                    string labelName = GetLabelName(funcName, branchIdx);
                    sb.AppendLine(" j " + labelName);
                    sb.AppendLine(paddingAlign);
                    sb.AppendLine(labelName + ":");
                }
                sb.AppendLine(" addi x10, x10, -1"); // decrement iteration count
                int distance = spacing * Counts[i];
                if (distance < 1024)
                {
                    sb.AppendLine(" bne x10, x0, " + funcTargetName); // short branch if we're not too far away
                }
                else
                {
                    string workaroundTarget = funcName + "_riscv_indirect_workaround";
                    sb.AppendLine(" beq x10, x0, " + workaroundTarget); // jump over indirect branch if iteration count is reached
                    sb.AppendLine(" jalr x0, x5"); // jump back to target (start of loop)
                    sb.AppendLine(workaroundTarget + ":");
                }
                sb.AppendLine(" ret");
            }
        }
    }
}
================================================
FILE: AsmGen/tests/CvtSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Scheduler capacity test built around integer-to-float converts.
// NOTE(review): the description string says "F2I" but the generated
// instructions (cvtsi2ss / scvtf) convert int -> float; confirm the label.
public class CvtSchedTest : UarchTest
{
    public CvtSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "cvtsched";
        this.Description = "F2I Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // mips64/riscv generation is not implemented for this test
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // each convert consumes one of the two pointer-chase results
            string[] dependentConverts1 =
            {
                " cvtsi2ss %rdi, %xmm1",
                " cvtsi2ss %rdi, %xmm2",
                " cvtsi2ss %rdi, %xmm3",
                " cvtsi2ss %rdi, %xmm4"
            };
            string[] dependentConverts2 =
            {
                " cvtsi2ss %rsi, %xmm1",
                " cvtsi2ss %rsi, %xmm2",
                " cvtsi2ss %rsi, %xmm3",
                " cvtsi2ss %rsi, %xmm4"
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentConverts1, dependentConverts2);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string[] dependentConverts1 =
            {
                " scvtf s0, w25",
                " scvtf s1, w25",
                " scvtf s2, w25",
                " scvtf s3, w25"
            };
            string[] dependentConverts2 =
            {
                " scvtf s0, w26",
                " scvtf s1, w26",
                " scvtf s2, w26",
                " scvtf s3, w26"
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentConverts1, dependentConverts2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/FAdd256RfTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Measures 256-bit FP/vector register file capacity, optionally with AVX-512
// state set up beforehand or an AVX-512 instruction kept in flight.
public class Fadd256RfTest : UarchTest
{
    // Selects optional AVX-512 state manipulation around the 256-bit RF test.
    public enum TestMode
    {
        none,
        setavx512regs,      // write zmm5-zmm31 before the test loop
        pendingavx512instr  // keep an AVX-512 add in flight during the test
    }

    private bool populateAvx512Regs;
    private bool pendingAvx512Instr;

    public Fadd256RfTest(int low, int high, int step, TestMode mode)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fadd256rf" + mode;
        this.Description = "256-bit FP/vector RF capacity, " + mode;
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        if (mode == TestMode.setavx512regs) populateAvx512Regs = true;
        else if (mode == TestMode.pendingavx512instr) pendingAvx512Instr = true;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return false;
        if (isa == IUarchTest.ISA.mips64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // ymm0 seeds four independent accumulator chains in ymm1-ymm4
            string initInstrs = " vmovups (%r8), %ymm0\n" +
                " vmovups %ymm0, %ymm1\n" +
                " vmovups %ymm0, %ymm2\n" +
                " vmovups %ymm0, %ymm3\n" +
                " vmovups %ymm0, %ymm4\n";
            if (this.populateAvx512Regs)
            {
                for (int i = 5; i < 32; i++)
                {
                    initInstrs += " vmovups 64(%r8), %zmm" + i + "\n";
                }
            }
            string postLoadInstr = string.Empty;
            if (this.pendingAvx512Instr)
            {
                initInstrs += " vmovups 64(%r8), %zmm5\n vmovups 128(%r8), %zmm6\n";
                postLoadInstr = " vaddps %zmm5, %zmm6, %zmm6";
            }
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " vaddps %ymm0, %ymm1, %ymm1";
            unrolledAdds[1] = " vaddps %ymm0, %ymm2, %ymm2";
            unrolledAdds[2] = " vaddps %ymm0, %ymm3, %ymm3";
            // bugfix: accumulate into ymm4 (was %ymm3, leaving ymm4 unused)
            unrolledAdds[3] = " vaddps %ymm0, %ymm4, %ymm4";
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs, postLoadInstrs1: postLoadInstr, postLoadInstrs2: postLoadInstr);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // fill all 32 256-bit vector registers (xvld/xvfadd mnemonics,
            // LoongArch-style, under the mips64 ISA tag), then add on xr1-xr4
            string initInstrs = "";
            for (int regIdx = 0; regIdx < 32; regIdx++)
            {
                initInstrs += " xvld $xr" + regIdx + ", $r6, " + regIdx * 32 + "\n";
            }
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " xvfadd.s $xr1, $xr1, $xr1";
            unrolledAdds[1] = " xvfadd.s $xr2, $xr2, $xr2";
            unrolledAdds[2] = " xvfadd.s $xr3, $xr3, $xr3";
            unrolledAdds[3] = " xvfadd.s $xr4, $xr4, $xr4";
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs: initInstrs);
        }
    }
}
}
================================================
FILE: AsmGen/tests/Fadd128RfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
// Measures 128-bit FP/vector register file capacity, optionally preceding the
// measured instructions with a branch dependent on the pointer-chasing load.
public class Fadd128RfTest : UarchTest
{
    // when set, a load-dependent branch runs before the measured adds
    private bool initialDependentBranch;

    public Fadd128RfTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fadd128rf" + (initialDependentBranch ? "db" : string.Empty);
        this.Description = "128-bit FP/vector RF capacity" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // the dependent-branch variant only exists for aarch64 and riscv
        if (this.initialDependentBranch)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // NOTE(review): this path emits 256-bit ymm ops even though the
            // test is described as 128-bit -- confirm intended
            string initInstrs = " vmovups (%r8), %ymm0\n";
            for (int i = 1; i < 16; i++) initInstrs += $" vmovups %ymm0, %ymm{i}\n";
            // bugfix: List requires a type argument to compile
            List<string> unrolledAddsList = new List<string>();
            for (int i = 1; i < 16; i++) unrolledAddsList.Add($" vaddps %ymm0, %ymm{i}, %ymm{i}");
            string[] unrolledAdds = unrolledAddsList.ToArray();
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            string initInstrs = " ldr q0, [x1]\n" +
                " ldr q1, [x1, #0x10]\n" +
                " ldr q2, [x1, #0x20]\n" +
                " ldr q3, [x1, #0x30]\n" +
                " ldr q4, [x1, #0x40]\n";
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " add v1.4s, v1.4s, v0.4s";
            unrolledAdds[1] = " add v2.4s, v2.4s, v0.4s";
            unrolledAdds[2] = " add v3.4s, v3.4s, v0.4s";
            unrolledAdds[3] = " add v4.4s, v4.4s, v0.4s";
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
            // branch target must exist somewhere after the generated functions
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            string initInstrs = " vsetvli t5, t6, e32\n vlw.v v0, (a1)\n vlw.v v1, (a1)\n vlw.v v2, (a1)\n vlw.v v3, (a1)";
            string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;
            postLoadInstrs += "\n mv t6, a2";
            string[] unrolledInstrs = new string[1];
            unrolledInstrs[0] = " vfadd.vv v0, v0, v0";
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false,
                initInstrs: initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
        }
    }
}
}
================================================
FILE: AsmGen/tests/Fadd128SchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Scheduler capacity test using 128-bit packed adds that all consume a
// register produced by a load tied to the pointer-chasing chain.
public class Fadd128SchedTest : UarchTest
{
    public Fadd128SchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fadd128sched";
        this.Description = "128-bit Vector FP Add Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // xmm0 carries the load dependency; each addps is an independent
            // consumer held in the scheduler until the load completes
            string[] packedAdds =
            {
                " addps %xmm0, %xmm1",
                " addps %xmm0, %xmm2",
                " addps %xmm0, %xmm3",
                " addps %xmm0, %xmm4"
            };
            UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, packedAdds, packedAdds);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // q0 is reloaded with an index derived from each chase register
            string loadForSide1 = " ldr q0, [x2, w25, uxtw#0]";
            string loadForSide2 = " ldr q0, [x2, w26, uxtw#0]";
            string[] packedAdds =
            {
                " add v1.4s, v1.4s, v0.4s",
                " add v2.4s, v2.4s, v0.4s",
                " add v3.4s, v3.4s, v0.4s",
                " add v4.4s, v4.4s, v0.4s"
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, packedAdds, packedAdds, false, null, loadForSide1, loadForSide2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/Fadd256SchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Scheduler capacity test using 256-bit packed adds dependent on a
// pointer-chasing load.
public class Fadd256SchedTest : UarchTest
{
    public Fadd256SchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fadd256sched";
        this.Description = "256-bit FP add scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.mips64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // ymm0 is dependent on ptr chasing load; ymm1-ymm4 are four
            // independent consumers
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " vaddps %ymm0, %ymm1, %ymm1";
            unrolledAdds[1] = " vaddps %ymm0, %ymm2, %ymm2";
            unrolledAdds[2] = " vaddps %ymm0, %ymm3, %ymm3";
            // bugfix: accumulate into ymm4 (was %ymm3, a copy-paste typo)
            unrolledAdds[3] = " vaddps %ymm0, %ymm4, %ymm4";
            UarchTestHelpers.GenerateX86AsmFp256SchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // fill the vector regs, then make xr1 depend on the chase result
            // via a masked indexed load; the adds below all consume xr1
            string initInstrs = "";
            for (int regIdx = 0; regIdx < 32; regIdx++)
            {
                initInstrs += " xvld $xr" + regIdx + ", $r6, " + regIdx * 32 + "\n";
            }
            initInstrs += " move $r16, $r0\n addi.d $r16, $r16, 0xF"; // load mask into r16
            string postLoadInstrs1 = " and $r15, $r12, $r16\n xvldx $xr1, $r6, $r15";
            string postLoadInstrs2 = " and $r15, $r13, $r16\n xvldx $xr1, $r6, $r15";
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " xvfadd.s $xr2, $xr2, $xr1";
            unrolledAdds[1] = " xvfadd.s $xr3, $xr3, $xr1";
            unrolledAdds[2] = " xvfadd.s $xr4, $xr4, $xr1";
            unrolledAdds[3] = " xvfadd.s $xr5, $xr5, $xr1";
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs: initInstrs,
                postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/FaddNsq.cs
================================================
using System.Text;
namespace AsmGen
{
// FP add scheduling-queue test that mixes load-dependent adds with
// load-independent filler adds, sized by a fixed total op count.
public class FaddNsq : UarchTest
{
    private int totalOps;

    public FaddNsq(int low, int high, int step, int totalOps)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "faddnsq" + totalOps;
        this.Description = "FADD, excluding possible NSQ";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.totalOps = totalOps;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // xmm1 carries the load dependency; xmm2 does not
            string setupInstrs = " cvtsi2ss %r12, %xmm2";
            string afterLoadInstrs = " cvtsi2ss %edi, %xmm1";
            string[] dependentAdds =
            {
                " addss %xmm1, %xmm0",
                " addss %xmm1, %xmm3",
                " addss %xmm1, %xmm4",
                " addss %xmm1, %xmm5"
            };
            string[] independentAdds =
            {
                " addss %xmm2, %xmm6",
                " addss %xmm2, %xmm7"
            };
            UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, dependentAdds, independentAdds, false, setupInstrs, afterLoadInstrs);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // s16 is loaded using the pointer-chase result; s15 is not
            string setupInstrs = " ldr s15, [x2]";
            string afterLoadInstrs = " ldr s16, [x2, w25, uxtw #2]";
            string[] dependentAdds =
            {
                " fadd s0, s0, s16",
                " fadd s1, s1, s16",
                " fadd s2, s2, s16",
                " fadd s3, s3, s16"
            };
            string[] independentAdds =
            {
                " fadd s17, s17, s15",
                " fadd s18, s18, s15",
                " fadd s19, s19, s15",
                " fadd s20, s20, s15"
            };
            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, dependentAdds, independentAdds, false, setupInstrs,
                postLoadInstrs: afterLoadInstrs);
        }
    }
}
}
================================================
FILE: AsmGen/tests/FaddSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Scalar FP add scheduler capacity test. On each ISA, the unrolled adds all
// consume a value produced by (or derived from) a pointer-chasing load, so
// they must wait in the scheduler until the load resolves.
public class FaddSchedTest : UarchTest
{
public FaddSchedTest(int low, int high, int step)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "faddsched";
this.Description = "FP Add Scheduler";
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
this.GetFunctionCallParameters = "structIterations, A, fpArr";
this.DivideTimeByCount = false;
}
// implemented for all four target ISAs below
public override bool SupportsIsa(IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64) return true;
if (isa == IUarchTest.ISA.aarch64) return true;
if (isa == IUarchTest.ISA.mips64) return true;
if (isa == IUarchTest.ISA.riscv) return true;
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64)
{
// xmm0 carries the load dependency; xmm1-xmm4 are independent consumers
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " addss %xmm0, %xmm1";
unrolledAdds[1] = " addss %xmm0, %xmm2";
unrolledAdds[2] = " addss %xmm0, %xmm3";
unrolledAdds[3] = " addss %xmm0, %xmm4";
UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
}
else if (isa == IUarchTest.ISA.aarch64)
{
// s16 carries the load dependency (set up by the Arm FP sched helper)
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " fadd s17, s17, s16";
unrolledAdds[1] = " fadd s18, s18, s16";
unrolledAdds[2] = " fadd s19, s19, s16";
unrolledAdds[3] = " fadd s20, s20, s16";
UarchTestHelpers.GenerateArmAsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
}
else if (isa == IUarchTest.ISA.mips64)
{
// f8 is reloaded using an offset masked from each chase register
// ($r12/$r13), so the adds below depend on the chase load
string initInstrs = " fld.s $f8, $r6, 0\n" +
" fld.s $f9, $r6, 4\n" +
" fld.s $f10, $r6, 8\n" +
" fld.s $f11, $r6, 12\n" +
" fld.s $f12, $r6, 16\n";
string postLoadInstrs1 = " andi $r19, $r12, 0xF\n add.d $r19, $r19, $r6\n fld.s $f8, $r19, 0";
string[] dependentAdds = new string[4];
dependentAdds[0] = " fadd.s $f9, $f9, $f8";
dependentAdds[1] = " fadd.s $f10, $f10, $f8";
dependentAdds[2] = " fadd.s $f11, $f11, $f8";
dependentAdds[3] = " fadd.s $f12, $f12, $f8";
string postLoadInstrs2 = " andi $r19, $r13, 0xF\n add.d $r19, $r19, $r6\n fld.s $f8, $r19, 0";
UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
sb, this.Counts, this.Prefix, dependentAdds, dependentAdds, includePtrChasingLoads: false, initInstrs: initInstrs,
postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
}
else if (isa == IUarchTest.ISA.riscv)
{
// same scheme as mips64: f4 is reloaded from an address masked off the
// chase registers (x5/x6), and the four adds all consume f4
string initInstrs = " fld f0, (x12)\n" +
" fld f1, 8(x12)\n" +
" fld f2, 16(x12)\n" +
" fld f3, 24(x12)\n" +
" fld f4, 32(x12)\n";
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " fadd.s f0, f0, f4";
unrolledAdds[1] = " fadd.s f1, f1, f4";
unrolledAdds[2] = " fadd.s f2, f2, f4";
unrolledAdds[3] = " fadd.s f3, f3, f4";
string postLoadInstrs1 = " andi x7, x5, 0xF\n add x7, x7, x12\n fld f4, (x7)";
string postLoadInstrs2 = " andi x7, x6, 0xF\n add x7, x7, x12\n fld f4, (x7)";
UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false,
initInstrs, postLoadInstrs1, postLoadInstrs2);
}
}
}
}
================================================
FILE: AsmGen/tests/FcmpSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// FCMP scheduler capacity test (aarch64 only). Each compare reads s16, which
// the Arm FP sched helper ties to a pointer-chasing load.
public class FcmpSchedTest : UarchTest
{
    public FcmpSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fcmpsched";
        this.Description = "FCMP Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            // four compares against s16, using distinct first operands s17-s20
            // to match the pattern used by FaddSchedTest
            string[] unrolledCmps = new string[4];
            unrolledCmps[0] = " fcmp s17, s16";
            // bugfix: was a duplicated "fcmp s19, s16", leaving s18 unused
            unrolledCmps[1] = " fcmp s18, s16";
            unrolledCmps[2] = " fcmp s19, s16";
            unrolledCmps[3] = " fcmp s20, s16";
            UarchTestHelpers.GenerateArmAsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledCmps, unrolledCmps);
        }
    }
}
}
================================================
FILE: AsmGen/tests/FlagRfTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Measures flags/condition-code register file capacity using instructions
// that only write flags (test/cmp) and produce no GPR result.
public class FlagRfTest : UarchTest
{
    // when true, a branch dependent on the chase load precedes the measured
    // instructions (aarch64 only)
    private bool initialDependentBranch;

    public FlagRfTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "flagrf" + (initialDependentBranch ? "db" : string.Empty);
        this.Description = "Flags Register File" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (this.initialDependentBranch) return isa == IUarchTest.ISA.aarch64;
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            string[] flagWrites = { " test %r15, %r14" };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, flagWrites, flagWrites, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string branchInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            string[] flagWrites = { " cmp x14, x15" };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, flagWrites, flagWrites, includePtrChasingLoads: true, postLoadInstrs1: branchInstrs, postLoadInstrs2: branchInstrs);
            // emit the branch target after the generated functions
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
    }
}
}
================================================
FILE: AsmGen/tests/Fma256SchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Scheduler capacity test using 256-bit fused multiply-adds dependent on a
// pointer-chasing load (mips64/LoongArch-style path only).
public class Fma256SchedTest : UarchTest
{
    public Fma256SchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fma256sched";
        // bugfix: description previously read "256-bit FP add scheduler",
        // copy-pasted from Fadd256SchedTest; this test generates FMAs
        this.Description = "256-bit FMA scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // only the mips64 path is enabled; amd64/aarch64 are deliberately off
        if (isa == IUarchTest.ISA.mips64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // NOTE(review): unreachable while SupportsIsa rejects amd64, and it
            // emits adds rather than FMAs (copied from Fadd256SchedTest)
            // ymm0 is dependent on ptr chasing load
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " vaddps %ymm0, %ymm1, %ymm1";
            unrolledAdds[1] = " vaddps %ymm0, %ymm2, %ymm2";
            unrolledAdds[2] = " vaddps %ymm0, %ymm3, %ymm3";
            // bugfix: accumulate into ymm4 (was %ymm3, a copy-paste typo)
            unrolledAdds[3] = " vaddps %ymm0, %ymm4, %ymm4";
            UarchTestHelpers.GenerateX86AsmFp256SchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // fill the vector regs, then make xr1 depend on the chase result
            // via a masked indexed load; the FMAs below all consume xr1
            string initInstrs = "";
            for (int regIdx = 0; regIdx < 32; regIdx++)
            {
                initInstrs += " xvld $xr" + regIdx + ", $r6, " + regIdx * 32 + "\n";
            }
            initInstrs += " move $r16, $r0\n addi.d $r16, $r16, 0xF"; // load mask into r16
            string postLoadInstrs1 = " and $r15, $r12, $r16\n xvldx $xr1, $r6, $r15";
            string postLoadInstrs2 = " and $r15, $r13, $r16\n xvldx $xr1, $r6, $r15";
            string[] unrolledFmas = new string[4];
            unrolledFmas[0] = " xvfmadd.s $xr2, $xr2, $xr2, $xr1";
            unrolledFmas[1] = " xvfmadd.s $xr3, $xr3, $xr3, $xr1";
            unrolledFmas[2] = " xvfmadd.s $xr4, $xr4, $xr4, $xr1";
            unrolledFmas[3] = " xvfmadd.s $xr5, $xr5, $xr5, $xr1";
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledFmas, unrolledFmas, includePtrChasingLoads: false, initInstrs: initInstrs,
                postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/FmovSched.cs
================================================
using System.Text;
namespace AsmGen
{
// Scheduler capacity test for fmov (vector reg -> GPR) on aarch64. All four
// fmovs consume d16, which is loaded using the pointer-chasing result.
public class FmovSched : UarchTest
{
    public FmovSched(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fmovsched";
        this.Description = "FMOV vec to gpr Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            string postLoadInstrs1 = " ldr d16, [x2, w25, sxtw #0]";
            // bugfix: the second chain must index with w26 (was w25, which tied
            // both halves to the same chase register -- see Fadd128SchedTest
            // and CvtSchedTest for the w25/w26 pattern)
            string postLoadInstrs2 = " ldr d16, [x2, w26, sxtw #0]";
            string[] unrolledInstrs = new string[4];
            unrolledInstrs[0] = " fmov x15, d16";
            unrolledInstrs[1] = " fmov x14, d16";
            unrolledInstrs[2] = " fmov x13, d16";
            unrolledInstrs[3] = " fmov x12, d16";
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null,
                postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/FmulSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Scalar 32-bit FP multiply scheduler capacity test for x86, ARM, and RISC-V.
public class FmulSchedTest : UarchTest
{
    public FmulSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fmulsched";
        this.Description = "FP (32-bit multiply) Scheduler Capacity Test";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
                GenerateX86Asm(sb);
                break;
            case IUarchTest.ISA.aarch64:
                GenerateArmAsm(sb);
                break;
            case IUarchTest.ISA.riscv:
                GenerateRiscvAsm(sb);
                break;
        }
    }

    public void GenerateX86Asm(StringBuilder sb)
    {
        // xmm0 is dependent on ptr chasing load
        string[] dependentMuls =
        {
            " mulss %xmm0, %xmm1",
            " mulss %xmm0, %xmm2",
            " mulss %xmm0, %xmm3",
            " mulss %xmm0, %xmm4"
        };
        UarchTestHelpers.GenerateX86AsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, dependentMuls, dependentMuls);
    }

    public void GenerateArmAsm(StringBuilder sb)
    {
        // s16 carries the load dependency into four independent multiplies
        string[] dependentMuls =
        {
            " fmul s17, s17, s16",
            " fmul s18, s18, s16",
            " fmul s19, s19, s16",
            " fmul s20, s20, s16"
        };
        UarchTestHelpers.GenerateArmAsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, dependentMuls, dependentMuls);
    }

    public void GenerateRiscvAsm(StringBuilder sb)
    {
        // f4 is reloaded from an address masked off the chase registers
        // (x5/x6), so the multiplies below must wait for that load
        string setupInstrs = " fld f0, (x12)\n" +
            " fld f1, 8(x12)\n" +
            " fld f2, 16(x12)\n" +
            " fld f3, 24(x12)\n" +
            " fld f4, 32(x12)\n";
        string[] dependentMuls =
        {
            " fmul.s f0, f0, f4",
            " fmul.s f1, f1, f4",
            " fmul.s f2, f2, f4",
            " fmul.s f3, f3, f4"
        };
        string reloadForSide1 = " andi x7, x5, 0xF\n add x7, x7, x12\n fld f4, (x7)";
        string reloadForSide2 = " andi x7, x6, 0xF\n add x7, x7, x12\n fld f4, (x7)";
        UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentMuls, dependentMuls, includePtrChasingLoads: false,
            setupInstrs, reloadForSide1, reloadForSide2);
    }
}
}
================================================
FILE: AsmGen/tests/FpRfTest.cs
================================================
using System.Text;
namespace AsmGen
{
// Measures FP register file capacity with scalar adds on independent
// registers, optionally preceding them with a branch dependent on the
// pointer-chasing load (aarch64/riscv only).
public class FpRfTest : UarchTest
{
// when true, a load-dependent branch precedes the measured instructions
private bool initialDependentBranch;
public FpRfTest(int low, int high, int step, bool initialDependentBranch)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "fprf" + (initialDependentBranch ? "db" : string.Empty);
this.Description = "FP Register File" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
this.GetFunctionCallParameters = "structIterations, A, fpArr";
this.DivideTimeByCount = false;
this.initialDependentBranch = initialDependentBranch;
}
public override bool SupportsIsa(IUarchTest.ISA isa)
{
// the dependent-branch variant only exists for aarch64 and riscv
if (this.initialDependentBranch)
{
if (isa == IUarchTest.ISA.aarch64) return true;
if (isa == IUarchTest.ISA.riscv) return true;
return false;
}
if (isa == IUarchTest.ISA.amd64) return true;
if (isa == IUarchTest.ISA.aarch64) return true;
if (isa == IUarchTest.ISA.mips64) return true;
if (isa == IUarchTest.ISA.riscv) return true;
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64)
{
// xmm1 is a shared source; xmm2-xmm5 are independent accumulators
string initInstrs = " movss (%r8), %xmm1\n" +
" movss 4(%r8), %xmm2\n" +
" movss 8(%r8), %xmm3\n" +
" movss 12(%r8), %xmm4\n" +
" movss 16(%r8), %xmm5\n";
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " addss %xmm1, %xmm2";
unrolledAdds[1] = " addss %xmm1, %xmm3";
unrolledAdds[2] = " addss %xmm1, %xmm4";
unrolledAdds[3] = " addss %xmm1, %xmm5";
UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);
}
else if (isa == IUarchTest.ISA.aarch64)
{
string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
// s17 is a shared source; s18-s21 are independent accumulators
string initInstrs = " ldr s17, [x2]\n" +
" ldr s18, [x2, 4]\n" +
" ldr s19, [x2, 8]\n" +
" ldr s20, [x2, 12]\n" +
" ldr s21, [x2, 16]\n";
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " fadd s18, s18, s17";
unrolledAdds[1] = " fadd s19, s19, s17";
unrolledAdds[2] = " fadd s20, s20, s17";
unrolledAdds[3] = " fadd s21, s21, s17";
UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
// branch target emitted after the generated functions
if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
}
else if (isa == IUarchTest.ISA.mips64)
{
// f8 is a shared source; f9-f12 are independent accumulators
string initInstrs = " fld.s $f8, $r6, 0\n" +
" fld.s $f9, $r6, 4\n" +
" fld.s $f10, $r6, 8\n" +
" fld.s $f11, $r6, 12\n" +
" fld.s $f12, $r6, 16\n";
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " fadd.s $f9, $f9, $f8";
unrolledAdds[1] = " fadd.s $f10, $f10, $f8";
unrolledAdds[2] = " fadd.s $f11, $f11, $f8";
unrolledAdds[3] = " fadd.s $f12, $f12, $f8";
UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);
}
else if (isa == IUarchTest.ISA.riscv)
{
string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;
// NOTE(review): the branch target is emitted BEFORE the generated
// functions here, while the aarch64 path above emits it after --
// confirm this ordering is intended
if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
string initInstrs = " fld f0, (x12)\n" +
" fld f1, 8(x12)\n" +
" fld f2, 16(x12)\n" +
" fld f3, 24(x12)\n" +
" fld f4, 32(x12)\n";
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " fadd.s f0, f0, f4";
unrolledAdds[1] = " fadd.s f1, f1, f4";
unrolledAdds[2] = " fadd.s f2, f2, f4";
unrolledAdds[3] = " fadd.s f3, f3, f4";
UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds,
includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
}
}
}
}
================================================
FILE: AsmGen/tests/FpStoreDataNsq.cs
================================================
using System.Text;
namespace AsmGen
{
// Store-data scheduler capacity test for FP 32-bit stores (x86 only). The
// stored value depends on the pointer-chasing load; filler adds do not.
public class FpStoreDataNsqTest : UarchTest
{
    public FpStoreDataNsqTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fpstoredatansq" + high;
        this.Description = "Store FP 32-bit data scheduler capacity, excluding nsq";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;
        // xmm1 carries the load dependency into every store's data operand;
        // xmm2-xmm6 feed the load-independent filler adds
        string initInstrs = " vzeroupper\n vpcmpeqd %xmm2, %xmm2, %xmm2\n vpxor %xmm2, %xmm3, %xmm3\n cvtsi2ss %r11, %xmm3\n movss %xmm3, %xmm4\n movss %xmm3, %xmm5\n movss %xmm3, %xmm6";
        string postLoadInstr = " cvtsi2ss %rdi, %xmm1";
        string[] dependentStores =
        {
            " movss %xmm1, (%r8)",
            " movss %xmm1, (%r8, %r14, 4)",
            " movss %xmm1, (%r8, %r13, 4)",
            " movss %xmm1, (%r8, %r12, 4)"
        };
        string[] independentAdds =
        {
            " addss %xmm2, %xmm3",
            " addss %xmm2, %xmm4",
            " addss %xmm2, %xmm5",
            " addss %xmm2, %xmm6"
        };
        // size the op count off the largest tested structure size
        int totalOps = this.Counts[this.Counts.Length - 1];
        UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, totalOps, this.Counts, this.Prefix, dependentStores, independentAdds, false, initInstrs: initInstrs, postLoadInstrs: postLoadInstr);
    }
}
}
================================================
FILE: AsmGen/tests/IdrfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class IdrfTest : UarchTest
{
    // Probes the register file holding immediates/displacements by mixing
    // immediate-heavy adds, displacement-addressed stores, and filler branches.
    public IdrfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "idrf";
        this.Description = "Immediate/Displacement Register File";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    // Only x86-64 assembly generation is implemented for this test.
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        const string dummyBranchTargetName = "idrftest_badtarget";
        if (isa == IUarchTest.ISA.amd64)
        {
            const int storeCount = 40;
            const int addCount = 130;
            // Fix: List needs its type argument (List<string>) to compile.
            List<string> testInstructions = new List<string>();
            int storeIdx = 0, addIdx = 0;
            for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++)
            {
                if (addIdx < addCount)
                {
                    // add with a unique immediate, rotating across r12-r15
                    string addInstr = " add $" + (i + 1) + ", %r" + (12 + (i % 4));
                    testInstructions.Add(addInstr);
                    addIdx++;
                }
                else if (storeIdx < storeCount)
                {
                    // store with a unique displacement (kept within one page via & 0xFF)
                    string storeInstr = " mov %r11d, " + (((i + 1) & 0xFF) * 4) + "(%r8)";
                    testInstructions.Add(storeInstr);
                    storeIdx++;
                }
                else
                {
                    // never-taken branch (r11 is nonzero); just occupies a branch slot
                    string branchInstr = $" test %r11, %r11\n je {dummyBranchTargetName}";
                    testInstructions.Add(branchInstr);
                }
            }
            string[] unrolledAdds = testInstructions.ToArray();
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);
            // Trap target: reaching it means the "never taken" branch was taken.
            sb.AppendLine($"{dummyBranchTargetName}:\n int3");
        }
    }
}
}
================================================
FILE: AsmGen/tests/IndirectBranchTest.cs
================================================
using System.Text;
using System.IO;
namespace AsmGen
{
public class IndirectBranchTest : IUarchTest
{
    private int[] branchCounts;
    private int[] targetCounts;
    private int globalHistoryAssistBits;
    // When true, emit extra direct branches correlated with the target index so the
    // BPU's global history can "assist" indirect prediction.
    private bool assists;

    public IndirectBranchTest(bool assist)
    {
        Prefix = "indirectbranch";
        Description = "Indirect branch prediction";
        FunctionDefinitionParameters = "uint64_t iterations, uint32_t **arr, uint32_t arrLen, uint64_t **scratch";
        DivideTimeByCount = true;
        branchCounts = new int[] { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 };
        targetCounts = new int[] { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 160, 192, 256, 384, 512 };
        globalHistoryAssistBits = 4;
        this.assists = assist;
    }

    public bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        if (isa == IUarchTest.ISA.mips64) return true;
        return false;
    }

    public void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            GenerateX86GccAsm(sb);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            GenerateArmAsm(sb);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            GenerateMipsAsm(sb);
        }
    }

    // Function name for a (branch count, target count) combination.
    private string GetFunctionName(int branchCount, int targetCount)
    {
        return Prefix + branchCount + "targets" + targetCount;
    }

    // Label name for one target of one indirect branch inside a generated function.
    private string GetTargetLabelName(int branchCount, int targetCount, int branchIndex, int targetIndex)
    {
        return GetFunctionName(branchCount, targetCount) + "branch" + branchIndex + "target" + targetIndex;
    }

    public void GenerateArmAsm(StringBuilder sb)
    {
        for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
        {
            int currentTargetCount = targetCounts[targetCountIdx];
            for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
            {
                int currentBranchCount = branchCounts[branchCountIdx];
                string functionLabel = GetFunctionName(currentBranchCount, currentTargetCount);
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine("\n" + functionLabel + ":");
                sb.AppendLine(" sub sp, sp, #0x60");
                // Fix: x17/x18 were previously saved at #0x40, the same slot as x9/x10,
                // so the x17/x18 save was clobbered and the restore loaded x9/x10's values.
                // Use the otherwise-unused #0x50 slot of the 0x60-byte frame instead.
                sb.AppendLine(" stp x17, x18, [sp, #0x50]");
                sb.AppendLine(" stp x9, x10, [sp, #0x40]");
                sb.AppendLine(" stp x11, x12, [sp, #0x30]");
                sb.AppendLine(" stp x15, x16, [sp, #0x20]");
                sb.AppendLine(" stp x13, x14, [sp, #0x10]");
                sb.AppendLine(" eor x16, x16, x16");
                sb.AppendLine(" eor x15, x15, x15");
                sb.AppendLine(" eor x14, x14, x14");
                sb.AppendLine(" eor x12, x12, x12");
                sb.AppendLine(" eor x11, x11, x11");
                // fill in jump tables for every branch. there has to be a better way to do this
                for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                {
                    // x3 = array of ptrs to jump tables
                    // x14 = index into array of jump tables
                    // x17 = ptr to jump table
                    sb.AppendLine(" ldr x17, [x3, w14, uxtw #3]");
                    for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                    {
                        // assuming 64-bit pointers and 4K page size
                        // use x16 = label index
                        string targetLabelName = GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx);
                        sb.AppendLine($" adrp x10, {targetLabelName}");
                        sb.AppendLine($" add x10, x10, :lo12:{targetLabelName}");
                        sb.AppendLine(" str x10, [x17, w16, uxtw #3]");
                        sb.AppendLine(" add w16, w16, 1");
                    }
                    sb.AppendLine(" eor x16, x16, x16");
                    sb.AppendLine(" add w14, w14, 1");
                }
                // w14 = branch index, w16 = pattern (target) array index
                sb.AppendLine(loopLabel + ":");
                sb.AppendLine(" eor w14, w14, w14");
                // generate branch blocks
                for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                {
                    // get a pointer to the jump table
                    sb.AppendLine(" ldr x9, [x3, w14, uxtw #3]");
                    // look up which target to jump to
                    sb.AppendLine(" ldr x15, [x1, w14, uxtw #3]");
                    sb.AppendLine(" add w14, w14, 1");
                    sb.AppendLine(" ldr w13, [x15, w16, uxtw #2]");
                    // use the target index (w13) to index into the jump table, and branch on it
                    sb.AppendLine(" ldr x17, [x9, w13, uxtw #3]");
                    // global history assist branches
                    // w13 = index into jump table. make that correlate with global history
                    if (this.assists)
                    {
                        sb.AppendLine(" mov x18, 1");
                        sb.AppendLine(" eor w12, w12, w12");
                        for (int eaxBits = 0; eaxBits < globalHistoryAssistBits; eaxBits++)
                        {
                            string targetName = functionLabel + "branch" + branchIdx + "ghist" + eaxBits;
                            sb.AppendLine(" and w12, w13, w18");
                            sb.AppendLine($" cbnz w12, {targetName}");
                            sb.AppendLine(" nop");
                            sb.AppendLine($"{targetName}:");
                            sb.AppendLine(" lsl w18, w18, 1");
                        }
                    }
                    // branch on value of x17
                    sb.AppendLine($" br x17");
                    sb.AppendLine(" nop");
                    // generate targets
                    for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                    {
                        sb.AppendLine(GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx) + ":");
                        sb.AppendLine($" nop");
                    }
                }
                // increment w16, and basically cmov 0 -> w16 if w16 = list length
                sb.AppendLine(" add w16, w16, 1");
                sb.AppendLine(" cmp w16, w2");
                sb.AppendLine(" csel w16, w11, w16, EQ");
                sb.AppendLine(" sub x0, x0, 1");
                sb.AppendLine($" cbnz x0, {loopLabel}");
                sb.AppendLine(" mov x0, x12");
                sb.AppendLine(" ldp x9, x10, [sp, #0x40]");
                sb.AppendLine(" ldp x11, x12, [sp, #0x30]");
                sb.AppendLine(" ldp x15, x16, [sp, #0x20]");
                sb.AppendLine(" ldp x13, x14, [sp, #0x10]");
                // Fix: restore x17/x18 from the slot they were actually saved to.
                sb.AppendLine(" ldp x17, x18, [sp, #0x50]");
                sb.AppendLine(" add sp, sp, #0x60");
                sb.AppendLine(" ret");
            }
        }
    }

    public void GenerateX86GccAsm(StringBuilder sb)
    {
        for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
        {
            int currentTargetCount = targetCounts[targetCountIdx];
            for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
            {
                /* rdi = iteration count
                 * rsi = array of target selection arrays, one for each branch
                 * rdx = length of pattern array
                 * rcx = array of jump tables, one for each branch
                 */
                int currentBranchCount = branchCounts[branchCountIdx];
                string functionLabel = GetFunctionName(currentBranchCount, currentTargetCount);
                sb.AppendLine("\n" + functionLabel + ":");
                sb.AppendLine(" push %rbx");
                sb.AppendLine(" push %r8");
                sb.AppendLine(" push %r9");
                sb.AppendLine(" push %r13");
                sb.AppendLine(" push %r15");
                sb.AppendLine(" push %r14");
                sb.AppendLine(" xor %rbx, %rbx");
                sb.AppendLine(" xor %r8, %r8");
                sb.AppendLine(" xor %r9, %r9");
                // initialize jump table
                for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                {
                    // rcx = array of ptrs to jump tables
                    // r9 = index into array of jump tables
                    // r15 = ptr to jump table
                    // load jump table base address into r15
                    sb.AppendLine(" mov (%rcx,%r9,8), %r15");
                    for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                    {
                        // assuming 64-bit pointers and 4K page size
                        // use rbx = index into
                        string targetLabelName = GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx);
                        sb.AppendLine($" lea {targetLabelName}(%rip), %rax");
                        sb.AppendLine($" mov %rax, (%r15,%rbx,8)");
                        sb.AppendLine(" inc %rbx");
                    }
                    sb.AppendLine(" xor %rbx, %rbx");
                    sb.AppendLine(" inc %r9");
                }
                sb.AppendLine(" xor %r8, %r8");
                sb.AppendLine(" xor %r9, %r9");
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine("\n" + loopLabel + ":");
                sb.AppendLine(" xor %r11, %r11"); // set index into arr of arrs to 0
                for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                {
                    sb.AppendLine(" mov (%rcx,%r11,8), %r15"); // load jump table base pointer into r15
                    sb.AppendLine(" mov (%rsi,%r11,8), %r10"); // load target select array base pointer into r10
                    sb.AppendLine(" inc %r11");
                    sb.AppendLine(" mov (%r10,%rbx,4), %eax"); // get the target for the current iteration into eax
                    sb.AppendLine(" mov (%r15,%rax,8), %r14"); // load address of jump target from jump table
                    if (assists)
                    {
                        sb.AppendLine(" mov %rsi, %r13"); // save rsi; it's used as the ghist bit mask below
                        sb.AppendLine(" mov $1, %rsi");
                        // NOTE(review): this path hardcodes 7 assist bits while the ARM path
                        // uses globalHistoryAssistBits (4) - possibly intentional, left as-is.
                        for (int eaxBits = 0; eaxBits < 7; eaxBits++)
                        {
                            string targetName = functionLabel + "branch" + branchIdx + "ghist" + eaxBits;
                            sb.AppendLine(" test %eax, %esi");
                            sb.AppendLine($" jnz {targetName}");
                            sb.AppendLine(" nop");
                            sb.AppendLine($"{targetName}:");
                            sb.AppendLine(" shl $1, %esi");
                        }
                        // Fix: restore rsi only on the assist path. Previously this ran
                        // unconditionally, clobbering rsi with an uninitialized r13 when
                        // assists was false.
                        sb.AppendLine(" mov %r13, %rsi");
                    }
                    sb.AppendLine(" jmp *%r14"); // and jump to it
                    // generate targets
                    for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                    {
                        sb.AppendLine(GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx) + ":");
                        sb.AppendLine($" nop");
                    }
                }
                // loop around in pattern history test array if necessary
                // avoiding an extra branch to not pollute BPU history
                sb.AppendLine(" inc %rbx");
                sb.AppendLine(" cmp %rbx, %rdx");
                sb.AppendLine(" cmove %r9, %rbx");
                // end of main loop over iteration count
                sb.AppendLine(" dec %rdi");
                sb.AppendLine(" jnz " + loopLabel);
                // function epilogue
                sb.AppendLine(" mov %r8, %rax");
                sb.AppendLine(" pop %r14");
                sb.AppendLine(" pop %r15");
                sb.AppendLine(" pop %r13");
                sb.AppendLine(" pop %r9");
                sb.AppendLine(" pop %r8");
                sb.AppendLine(" pop %rbx");
                sb.AppendLine(" ret");
            }
        }
    }

    // Despite the name, this emits LoongArch-style assembly ($rN, ld.d, alsl.d, maskeqz),
    // matching the repo's mips64 target.
    public void GenerateMipsAsm(StringBuilder sb)
    {
        for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
        {
            int currentTargetCount = targetCounts[targetCountIdx];
            for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
            {
                /* r4 = iteration count
                 * r5 = array of target selection arrays, one for each branch
                 * r6 = length of pattern array
                 * r7 = array of jump tables, one for each branch
                 */
                int currentBranchCount = branchCounts[branchCountIdx];
                string functionLabel = GetFunctionName(currentBranchCount, currentTargetCount);
                sb.AppendLine("\n" + functionLabel + ":");
                // initialize jump tables. r12-r20 are temporary regs.
                sb.AppendLine(" move $r13, $r7"); // use r13 to access array of pointers to jump tables
                for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                {
                    sb.AppendLine(" ld.d $r15, $r13, 0"); // load address of branch's jump table into r15
                    // initialize the jump table. r15 = base addr. rely on C# for bounds :)
                    for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                    {
                        // write label addresses into array
                        string targetLabelName = GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx);
                        sb.AppendLine(" la $r16, " + targetLabelName); // load branch target address into r16
                        sb.AppendLine(" st.d $r16, $r15, 0"); // store branch target address
                        sb.AppendLine(" addi.d $r15, $r15, 8"); // increment array pointer
                    }
                    sb.AppendLine(" addi.d $r13, $r13, 8"); // increment array pointer for array of pointers to jump tables
                }
                // loop through branches for (iterations) times
                string loopLabel = functionLabel + "_loop";
                sb.AppendLine(" move $r14, $r0"); // r14 = branch target index
                sb.AppendLine(" move $r17, $r0");
                sb.AppendLine(" addi.d $r17, $r17, 1"); // use r17 just to store 1
                sb.AppendLine("\n" + loopLabel + ":");
                sb.AppendLine(" move $r12, $r5"); // r12 to hold pointer to target selection array
                sb.AppendLine(" move $r13, $r7"); // r13 to hold pointer to jump target array
                for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                {
                    sb.AppendLine(" ld.d $r16, $r12, 0"); // r16 = base address of target select array
                    sb.AppendLine(" ld.d $r18, $r13, 0"); // r18 = base address of jump target array
                    // target select array[target index]
                    sb.AppendLine(" alsl.d $r15, $r14, $r0, 0x2");
                    sb.AppendLine(" add.d $r15, $r15, $r16");
                    sb.AppendLine(" ld.w $r19, $r15, 0"); // load 32-bit target index
                    sb.AppendLine(" alsl.d $r15, $r19, $r0, 0x3"); // now index into jump table
                    sb.AppendLine(" add.d $r15, $r18, $r15");
                    sb.AppendLine(" ld.d $r20, $r15, 0");
                    // increment pointers for next branch
                    sb.AppendLine(" addi.d $r12, $r12, 8");
                    sb.AppendLine(" addi.d $r13, $r13, 8");
                    sb.AppendLine(" jr $r20");
                    // generate targets
                    for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                    {
                        sb.AppendLine(GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx) + ":");
                        sb.AppendLine($" nop");
                    }
                }
                // loop back. and try to reset branch index without a branch
                sb.AppendLine(" addi.d $r14, $r14, 1"); // if r14 == r6 (pattern array length), set r14 back to 0 somehow
                sb.AppendLine(" sub.d $r12, $r14, $r6"); // 12 = temporary result of comparison
                sb.AppendLine(" maskeqz $r14, $r14, $r12"); // if r12 = 0, set r14 to 0. otherwise use current value
                sb.AppendLine(" sub.d $r4, $r4, $r17");
                sb.AppendLine(" bnez $r4, " + loopLabel);
                sb.AppendLine(" jr $r1");
            }
        }
    }

    // kinda hack this to put in initialization code we need
    public void GenerateExternLines(StringBuilder sb)
    {
        for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
            for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
                sb.AppendLine("extern uint64_t " + GetFunctionName(branchCounts[branchCountIdx], targetCounts[targetCountIdx]) + $"({FunctionDefinitionParameters}) __attribute((sysv_abi));");
        GenerateInitializationCode(sb);
        string gccFunction = File.ReadAllText(Path.Combine(Program.DataFilesDir, "GccIndirectBranchFunction.c"));
        sb.AppendLine(gccFunction);
    }

    // Emits C arrays describing the generated functions plus an initializer that
    // fills a [branch count][target count] function-pointer table.
    public void GenerateInitializationCode(StringBuilder sb)
    {
        sb.AppendLine($"uint32_t maxIndirectBranchCount = {branchCounts.Length};");
        sb.Append($"uint32_t indirectBranchCounts[{branchCounts.Length}] = ");
        sb.Append("{ " + branchCounts[0]);
        for (int i = 1; i < branchCounts.Length; i++) sb.Append(", " + branchCounts[i]);
        sb.AppendLine(" };");
        sb.Append($"uint32_t indirectBranchTargetCounts[{targetCounts.Length}] = ");
        sb.Append("{ " + targetCounts[0]);
        for (int i = 1; i < targetCounts.Length; i++) sb.Append(", " + targetCounts[i]);
        sb.AppendLine(" };");
        sb.AppendLine($"uint64_t (__attribute((sysv_abi)) *indirectBranchTestFuncArr[{branchCounts.Length}][{targetCounts.Length}])({FunctionDefinitionParameters});");
        sb.AppendLine("void initializeIndirectBranchFuncArr() {");
        for (int i = 0; i < branchCounts.Length; i++)
        {
            for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
            {
                sb.AppendLine($"  indirectBranchTestFuncArr[{i}][{targetCountIdx}] = {GetFunctionName(branchCounts[i], targetCounts[targetCountIdx])};");
            }
        }
        sb.AppendLine("}");
    }

    public string Prefix { get; set; }
    public string Description { get; set; }
    public int[] Counts;
    public string FunctionDefinitionParameters { get; set; }
    public string GetFunctionCallParameters { get; set; }
    public bool DivideTimeByCount { get; set; }

    public void GenerateAsmGlobalLines(StringBuilder sb)
    {
        for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
            for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
                sb.AppendLine(".global " + GetFunctionName(branchCounts[branchCountIdx], targetCounts[targetCountIdx]));
    }

    public void GenerateTestBlock(StringBuilder sb, IUarchTest.ISA isa)
    {
        sb.AppendLine("  if (argc > 1 && strncmp(test_name, \"" + Prefix + "\", " + Prefix.Length + ") == 0) {");
        sb.AppendLine("  printf(\"" + Description + ":\\n\");");
        string ibMain = File.ReadAllText(Path.Combine(Program.DataFilesDir, "IndirectBranchTestBlock.c"));
        sb.AppendLine(ibMain);
        sb.AppendLine("  }\n");
    }
}
}
================================================
FILE: AsmGen/tests/IntRfDepStoreTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class IntRfTestDependentStore : UarchTest
{
    // Integer register file capacity test where the measured section is preceded
    // by a store whose data depends on the pointer-chasing load.
    public IntRfTestDependentStore(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "intrfds";
        Description = "Integer Register File, preceded by a dependent store";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        GetFunctionCallParameters = "structIterations, A, fpArr";
        DivideTimeByCount = false;
    }

    // Only aarch64 is reported as supported; the dependent-store instructions are
    // only wired up in the aarch64 path below.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
            {
                // Unreachable via SupportsIsa; kept for completeness.
                string[] filler =
                {
                    " add %r11, %r15",
                    " add %r11, %r14",
                    " add %r11, %r13",
                    " add %r11, %r12"
                };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true);
                break;
            }
            case IUarchTest.ISA.aarch64:
            {
                // Stores after each pointer-chasing load; w15 is written by the adds
                // below, so the store data depends on the measured section.
                string depStore1 = "str w15, [x2, w25, uxtw #2]";
                string depStore2 = "str w15, [x2, w26, uxtw #2]";
                string[] filler =
                {
                    " add x15, x15, x11",
                    " add x14, x14, x11",
                    " add x13, x13, x11",
                    " add x12, x12, x11"
                };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true, postLoadInstrs1: depStore1, postLoadInstrs2: depStore2);
                break;
            }
            case IUarchTest.ISA.mips64:
            {
                // Unreachable via SupportsIsa; kept for completeness.
                string[] filler =
                {
                    " add.d $r15, $r15, $r14",
                    " add.d $r16, $r16, $r14",
                    " add.d $r17, $r17, $r14",
                    " add.d $r18, $r18, $r14"
                };
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true);
                break;
            }
            case IUarchTest.ISA.riscv:
            {
                // Unreachable via SupportsIsa; kept for completeness.
                string[] filler =
                {
                    " add x28, x28, x29",
                    " add x30, x30, x29",
                    " add x31, x31, x29",
                    " add x18, x18, x29"
                };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true);
                break;
            }
        }
    }
}
}
================================================
FILE: AsmGen/tests/IntRfTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class IntRfTest : UarchTest
{
    // When set, each pointer-chasing load feeds a branch, so the measured section
    // sits behind an unresolved (dependent) branch.
    private bool initialDependentBranch;

    public IntRfTest(int low, int high, int step, bool initialDependentBranch)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "intrf" + (initialDependentBranch ? "db" : string.Empty);
        Description = "Integer Register File" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        GetFunctionCallParameters = "structIterations, A";
        DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // The dependent-branch variant is only implemented for aarch64 and riscv.
        if (this.initialDependentBranch)
            return isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.riscv;
        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.mips64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
            {
                string[] filler =
                {
                    " add %r11, %r15",
                    " add %r11, %r14",
                    " add %r11, %r13",
                    " add %r11, %r12"
                };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true);
                break;
            }
            case IUarchTest.ISA.aarch64:
            {
                string branchInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
                string[] filler =
                {
                    " add x15, x15, x11",
                    " add x14, x14, x11",
                    " add x13, x13, x11",
                    " add x12, x12, x11"
                };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true, postLoadInstrs1: branchInstrs, postLoadInstrs2: branchInstrs);
                // Branch target is emitted after the generated functions.
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
                break;
            }
            case IUarchTest.ISA.mips64:
            {
                string[] filler =
                {
                    " add.d $r15, $r15, $r14",
                    " add.d $r16, $r16, $r14",
                    " add.d $r17, $r17, $r14",
                    " add.d $r18, $r18, $r14"
                };
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true);
                break;
            }
            case IUarchTest.ISA.riscv:
            {
                // Branch target is emitted before the generated functions here.
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
                string branchInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;
                string[] filler =
                {
                    " add x28, x28, x29",
                    " add x30, x30, x29",
                    " add x31, x31, x29",
                    " add x18, x18, x29"
                };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, filler, filler, includePtrChasingLoads: true, postLoadInstrs1: branchInstrs, postLoadInstrs2: branchInstrs);
                break;
            }
        }
    }
}
}
================================================
FILE: AsmGen/tests/JsCvtNsq.cs
================================================
using System.Text;
namespace AsmGen
{
public class JsCvtNsq : UarchTest
{
    // Total op count handed to the NSQ harness (dependent + independent filler).
    private int totalOps;

    public JsCvtNsq(int low, int high, int step, int totalOps)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "jscvtnsq";
        Description = "FJCVTZS (FP Javascript Convert to Signed Fixed Point, Rounding toward Zero) Scheduler, excluding possible NSQ";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        GetFunctionCallParameters = "structIterations, A, fpArr";
        DivideTimeByCount = false;
        this.totalOps = totalOps;
    }

    // FJCVTZS only exists on aarch64 (ARMv8.3 JSCVT extension).
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.aarch64) return;

        // d16 depends on the pointer-chasing load; d15 is loaded once up front.
        string loadDependentValue = " ldr d16, [x2, w25, sxtw #0]";
        string initInstrs = " ldr d15, [x2]";
        string[] depInstrs =
        {
            " fjcvtzs w15, d16",
            " fjcvtzs w14, d16",
            " fjcvtzs w13, d16",
            " fjcvtzs w12, d16"
        };
        string[] indepInstrs =
        {
            " fjcvtzs w15, d15",
            " fjcvtzs w14, d15",
            " fjcvtzs w13, d15",
            " fjcvtzs w12, d15"
        };
        UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs,
            postLoadInstrs: loadDependentValue);
    }
}
}
================================================
FILE: AsmGen/tests/JsCvtSched.cs
================================================
using System.Text;
namespace AsmGen
{
public class JsCvtSched : UarchTest
{
    // Scheduler capacity test built from chains of FJCVTZS instructions.
    public JsCvtSched(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "jscvtsched";
        Description = "FJCVTZS (FP Javascript Convert to Signed Fixed Point, Rounding toward Zero) Scheduler";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        GetFunctionCallParameters = "structIterations, A, fpArr";
        DivideTimeByCount = false;
    }

    // FJCVTZS only exists on aarch64 (ARMv8.3 JSCVT extension).
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.aarch64) return;

        // d16 is reloaded after each pointer-chasing load so every fjcvtzs below
        // depends on the load result.
        string loadDependentValue = " ldr d16, [x2, w25, sxtw #0]";
        string[] convertOps =
        {
            " fjcvtzs w15, d16",
            " fjcvtzs w14, d16",
            " fjcvtzs w13, d16",
            " fjcvtzs w12, d16"
        };
        UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, convertOps, convertOps, false, null,
            postLoadInstrs1: loadDependentValue, postLoadInstrs2: loadDependentValue);
    }
}
}
================================================
FILE: AsmGen/tests/JumpNsqTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class JumpNsqTest : UarchTest
{
    // Scheduler capacity for not-taken conditional jumps, with the
    // non-scheduling queue (NSQ) contribution excluded.
    public JumpNsqTest(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "jumpnsq";
        Description = "Scheduler, Not-Taken Jumps, excluding possible nsq";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        GetFunctionCallParameters = "structIterations, A";
        DivideTimeByCount = false;
    }

    // Only x86-64 is implemented; other ISAs were left commented out upstream.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // Compares are never equal in practice, so the jumps stay not-taken;
        // the first compare depends on the pointer-chasing load.
        string[] dependentJumps = { " cmp %rdi, %rsi\n je jumpnsq_reallybadthing" };
        string[] independentJumps = { " cmp %r13, %r14\n je jumpnsq_reallybadthing" };
        UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentJumps, independentJumps);
        // Trap target: reaching it means a "never taken" jump was taken.
        sb.AppendLine("jumpnsq_reallybadthing:\n int3");
    }
}
}
================================================
FILE: AsmGen/tests/JumpSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class JumpSchedTest : UarchTest
{
    // Scheduler capacity test using chains of not-taken conditional jumps.
    public JumpSchedTest(int low, int high, int step)
    {
        Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        Prefix = "jumpsched";
        Description = "Scheduler, Not-Taken Jumps";
        FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        GetFunctionCallParameters = "structIterations, A";
        DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // mips64 is not implemented (was commented out upstream).
        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
            {
                string[] jumps = { " cmp %rdi, %rsi\n je jumpsched_reallybadthing" };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, jumps, jumps, includePtrChasingLoads: true);
                // Trap target: int3 fires if a "never taken" jump is taken.
                sb.AppendLine("jumpsched_reallybadthing:\n int3");
                break;
            }
            case IUarchTest.ISA.aarch64:
            {
                string[] jumps = { " cmp x25, x26\n b.eq jumpsched_reallybadthing" };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, jumps, jumps, includePtrChasingLoads: true);
                // Trap target: permanently-undefined encoding faults if reached.
                sb.AppendLine("jumpsched_reallybadthing:\n .word 0xf7f0a000");
                break;
            }
            case IUarchTest.ISA.riscv:
            {
                // todo
                string[] jumps = { " beq x5, x6, jumpsched_reallybadthing" };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, jumps, jumps, false);
                // Trap target: all-zero word is an illegal instruction on RISC-V.
                sb.AppendLine("jumpsched_reallybadthing:\n .word 0x00000000");
                break;
            }
        }
    }
}
}
================================================
FILE: AsmGen/tests/LdqTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class LdqTest : UarchTest
{
    // When true, each pointer-chasing load is followed by a branch that depends on
    // it, and only the aarch64/riscv variants are generated.
    bool initialDependentBranch;

    public LdqTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.initialDependentBranch = initialDependentBranch;
        string suffix = initialDependentBranch ? "db" : string.Empty;
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "ldq" + suffix;
        this.Description = "Load Queue" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // The dependent-branch variant only has aarch64 and riscv codegen below.
        if (this.initialDependentBranch)
            return isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.riscv;

        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.mips64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // Independent loads fill load queue entries while waiting behind the
            // pointer-chasing load generated by the helper.
            string[] loads =
            {
                " mov (%r8), %r15",
                " mov (%r8), %r14",
                " mov (%r8), %r13",
                " mov (%r8), %r12"
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, loads, loads, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string branch = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            string[] loads =
            {
                " ldr x15, [x2]",
                " ldr x14, [x2]",
                " ldr x13, [x2]",
                " ldr x12, [x2]"
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, loads, loads, includePtrChasingLoads: true, postLoadInstrs1: branch, postLoadInstrs2: branch);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // NOTE(review): "ld.d $rN, $rM, off" reads like LoongArch syntax despite
            // the mips64 tag - confirm against the intended toolchain.
            string[] loads =
            {
                " ld.d $r15, $r6, 0",
                " ld.d $r16, $r6, 8",
                " ld.d $r17, $r6, 16",
                " ld.d $r18, $r6, 24"
            };
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, loads, loads, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            string branch = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;
            string[] loads =
            {
                " ld x28, (x11)",
                " ld x29, 8(x11)",
                " ld x30, 16(x11)",
                " ld x31, 24(x11)"
            };
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, loads, loads,
                includePtrChasingLoads: true, postLoadInstrs1: branch, postLoadInstrs2: branch);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
        }
    }
}
}
================================================
FILE: AsmGen/tests/LeaSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class LeaSchedTest : UarchTest
{
    public LeaSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "leasched";
        this.Description = "Scheduler, lea with base + index + offset";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    // lea is x86-only, so no other ISA gets codegen.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // Four chains, each lea feeding its own destination register.
        string[] leas =
        {
            " lea 128(%r15, %rdi), %r15",
            " lea 128(%r14, %rdi), %r14",
            " lea 128(%r13, %rdi), %r13",
            " lea 128(%r12, %rdi), %r12"
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, leas, leas, includePtrChasingLoads: false);
    }
}
}
================================================
FILE: AsmGen/tests/LoadNsq.cs
================================================
using System.Text;
namespace AsmGen
{
public class LoadNsq : UarchTest
{
    public LoadNsq(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "loadnsq";
        this.Description = "Load Address Scheduler, Excluding any NSQ";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // Loads indexed by %rdi wait on the pointer-chasing result; loads off a
            // plain base register can issue immediately.
            string[] dependent =
            {
                " mov (%r8, %rdi, 4), %r15",
                " mov (%r8, %rdi, 4), %r14",
                " mov (%r8, %rdi, 4), %r13"
            };
            string[] independent =
            {
                " mov (%r8), %r15",
                " mov (%r8), %r14",
                " mov (%r8), %r13"
            };
            UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependent, independent, ptrChasingLoadsInSq: true);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // Same idea: w25 holds the pointer-chasing result.
            string[] dependent =
            {
                " ldr w15, [x2, w25, uxtw #2]",
                " ldr w14, [x2, w25, uxtw #2]",
                " ldr w13, [x2, w25, uxtw #2]"
            };
            string[] independent =
            {
                " ldr w12, [x2]",
                " ldr w11, [x2]",
                " ldr w10, [x2]"
            };
            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependent, independent);
        }
    }
}
}
================================================
FILE: AsmGen/tests/LoadSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class LoadSchedTest : UarchTest
{
    public LoadSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "loadsched";
        this.Description = "Load Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.mips64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // Filler loads indexed by the two pointer-chasing results (%rdi, %rsi)
            // so every load has to wait in the scheduler.
            string[] rdiLoads =
            {
                " mov (%r8, %rdi, 4), %r15",
                " mov (%r8, %rdi, 4), %r14",
                " mov (%r8, %rdi, 4), %r13",
                " mov (%r8, %rdi, 4), %r12"
            };
            string[] rsiLoads =
            {
                " mov (%r8, %rsi, 4), %r15",
                " mov (%r8, %rsi, 4), %r14",
                " mov (%r8, %rsi, 4), %r13",
                " mov (%r8, %rsi, 4), %r12"
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, rdiLoads, rsiLoads, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // w25/w26 hold the pointer-chasing results.
            string[] w25Loads =
            {
                " ldr w15, [x2, w25, uxtw #2]",
                " ldr w14, [x2, w25, uxtw #2]",
                " ldr w13, [x2, w25, uxtw #2]",
                " ldr w12, [x2, w25, uxtw #2]"
            };
            string[] w26Loads =
            {
                " ldr w15, [x2, w26, uxtw #2]",
                " ldr w14, [x2, w26, uxtw #2]",
                " ldr w13, [x2, w26, uxtw #2]",
                " ldr w12, [x2, w26, uxtw #2]"
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, w25Loads, w26Loads, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // Derive an address from each pointer-chasing result ($r12 / $r13).
            // NOTE(review): offsets 0/8/12/16 overlap for 8-byte loads - confirm
            // whether 0/8/16/24 was intended.
            string postLoadInstrs1 = " andi $r19, $r12, 0xF\n add.d $r19, $r19, $r6";
            string postLoadInstrs2 = " andi $r19, $r13, 0xF\n add.d $r19, $r19, $r6";
            string[] loads =
            {
                " ld.d $r15, $r19, 0",
                " ld.d $r16, $r19, 8",
                " ld.d $r17, $r19, 12",
                " ld.d $r18, $r19, 16"
            };
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, loads, loads, includePtrChasingLoads: true, initInstrs: null,
                postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            // x5 and x6 are pointer chasing loads
            string postLoadInstrs1 = " andi x7, x5, 0xF\n add x7, x7, x12";
            string postLoadInstrs2 = " andi x7, x6, 0xF\n add x7, x7, x12";
            string[] loads =
            {
                " ld x28, (x7)",
                " ld x29, 8(x7)",
                " ld x30, 16(x7)",
                " ld x31, 24(x7)"
            };
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, loads, loads, includePtrChasingLoads: true,
                postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MaddSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MaddSchedTest : UarchTest
{
    public MaddSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "maddsched";
        this.Description = "Scheduler, Integer Multiply-Add";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    // madd codegen only exists for aarch64.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.aarch64) return;

        // Four chains, each madd accumulating into its own destination register.
        string[] madds =
        {
            " madd x15, x15, x25, x10",
            " madd x14, x14, x25, x10",
            " madd x13, x13, x25, x10",
            " madd x12, x12, x25, x10"
        };
        UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, madds, madds, includePtrChasingLoads: false);
    }
}
}
================================================
FILE: AsmGen/tests/MaskRfTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MaskRfTest : UarchTest
{
    public MaskRfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "maskrf";
        this.Description = "Mask Registers - AVX-512 only";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    // kaddb is an AVX-512 mask-register op, so x86-64 only.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // Four independent chains on k1-k4, all sourcing k0.
        string[] kadds =
        {
            " kaddb %k0, %k1, %k1",
            " kaddb %k0, %k2, %k2",
            " kaddb %k0, %k3, %k3",
            " kaddb %k0, %k4, %k4"
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, kadds, kadds, includePtrChasingLoads: false);
    }
}
}
================================================
FILE: AsmGen/tests/MixAddJumpSched.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixAddJumpSchedTest : UarchTest
{
    public MixAddJumpSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixaddjumpsched";
        this.Description = "Scheduler, Mixed Adds and Not-Taken Jumps in 3:1 ratio";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        // if (isa == IUarchTest.ISA.mips64) return true;
        // if (isa == IUarchTest.ISA.riscv) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // One never-taken jump followed by three adds per unrolled group.
            // Bug fix: slot 3 previously repeated "add %rsi, %r14" (copy-paste),
            // serializing slots 2 and 3; use %r13 so the three adds stay
            // independent, matching the 3:1 description.
            string[] unrolledJumps = new string[4];
            unrolledJumps[0] = " cmp %rdi, %rsi\n je mixaddjumpsched_reallybadthing";
            unrolledJumps[1] = " add %rsi, %r15";
            unrolledJumps[2] = " add %rsi, %r14";
            unrolledJumps[3] = " add %rsi, %r13";
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);
            // Target that must never be reached; int3 traps if the branch is taken.
            sb.AppendLine("mixaddjumpsched_reallybadthing:\n int3");
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // Same shape as the x86 version; x13 replaces the duplicated x14 in slot 3.
            string[] unrolledJumps = new string[4];
            unrolledJumps[0] = " cmp x25, x26\n b.eq mixaddjumpsched_reallybadthing";
            unrolledJumps[1] = " add x15, x15, x25";
            unrolledJumps[2] = " add x14, x14, x25";
            unrolledJumps[3] = " add x13, x13, x25";
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);
            sb.AppendLine("mixaddjumpsched_reallybadthing:\n .word 0xf7f0a000");
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            // todo - dead code today (SupportsIsa rejects riscv), and these are muls
            // rather than add/jump pairs; finish porting before enabling.
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " mul x30, x30, x5";
            unrolledAdds[1] = " mul x29, x29, x5";
            unrolledAdds[2] = " mul x28, x28, x5";
            unrolledAdds[3] = " mul x31, x31, x5";
            string[] unrolledAdds1 = new string[4];
            unrolledAdds1[0] = " mul x30, x30, x6";
            unrolledAdds1[1] = " mul x31, x31, x6";
            unrolledAdds1[2] = " mul x28, x28, x6";
            unrolledAdds1[3] = " mul x29, x29, x6";
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, false);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixAddvJsCvtNsq.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixAddvJsCvtNsq : UarchTest
{
    public MixAddvJsCvtNsq(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixaddvjscvtnsq";
        this.Description = "ADDV and fjcvtzs Scheduler, Excluding any NSQ";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    // fjcvtzs is an aarch64 instruction; no other ISA is supported.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.aarch64) return;

        // q16/d2 reload from the pointer-chase index so their consumers must wait;
        // q17/d15 are loaded once up front so their consumers can issue immediately.
        string postLoad = " ldr q16, [x2, w25, sxtw #0]\n ldr d2, [x2, w25, sxtw #0]";
        string init = " ldr q17, [x2]\n ldr d15, [x2]";
        string[] dependent =
        {
            " addv h1, v16.4h",
            " fjcvtzs w15, d2",
            " addv h3, v16.4h",
            " fjcvtzs w14, d2"
        };
        string[] independent =
        {
            " addv h4, v17.4h",
            " fjcvtzs w12, d15",
            " addv h5, v17.4h",
            " fjcvtzs w13, d15"
        };
        UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependent, independent, false, initInstrs: init,
            postLoadInstrs: postLoad);
    }
}
}
================================================
FILE: AsmGen/tests/MixAddvJsCvtSched.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixAddvJsCvtSched : UarchTest
{
    public MixAddvJsCvtSched(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixaddvjscvtsched";
        this.Description = "ADDV and fjcvtzs Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    // fjcvtzs is an aarch64 instruction; no other ISA is supported.
    public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.aarch64) return;

        // Reload q16/d2 from each pointer-chasing index (w25 / w26) so all of the
        // addv and fjcvtzs fillers depend on a long-latency load.
        string postLoad1 = " ldr q16, [x2, w25, sxtw #0]\n ldr d2, [x2, w25, sxtw #0]";
        string postLoad2 = " ldr q16, [x2, w26, sxtw #0]\n ldr d2, [x2, w26, sxtw #0]";
        string[] fillers =
        {
            " addv h1, v16.4h",
            " fjcvtzs w15, d2",
            " addv h3, v16.4h",
            " fjcvtzs w14, d2"
        };
        UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillers, fillers, false, null,
            postLoadInstrs1: postLoad1, postLoadInstrs2: postLoad2);
    }
}
}
================================================
FILE: AsmGen/tests/MixBranchStoreTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixBranchStoreTest : UarchTest
{
    // NOTE(review): mixNops is stored but never read anywhere in this class; kept
    // only so the constructor signature stays compatible with existing callers.
    private bool mixNops;
    private bool initialDependentBranch;

    public MixBranchStoreTest(int low, int high, int step, bool mixNops = false, bool initialDependentBranch = false)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixstqbob" + (initialDependentBranch ? "db" : string.Empty);
        // Fixed: removed a stray empty statement (";;") that followed this line.
        this.Description = "Mixed NT branches and stores" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.mixNops = mixNops;
        this.initialDependentBranch = initialDependentBranch;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            GenerateArmAsm(sb);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
    }

    // Emits one function per entry in Counts. Each loop iteration performs a
    // pointer-chasing load (w25, then w26) followed by Counts[i] cmp/b.eq pairs
    // whose targets are the immediately-following label, so the branches fall
    // through either way and pile up behind the load.
    // NOTE(review): despite the class name and description, no store instructions
    // are emitted here - confirm whether stores were intentionally removed.
    public void GenerateArmAsm(StringBuilder sb)
    {
        string dependentBranch = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
        for (int i = 0; i < Counts.Length; i++)
        {
            string funcName = Prefix + Counts[i];
            sb.AppendLine("\n" + funcName + ":");
            // Prologue: save callee-saved registers used below.
            sb.AppendLine(" sub sp, sp, #0x50");
            sb.AppendLine(" stp x14, x15, [sp, #0x10]");
            sb.AppendLine(" stp x12, x13, [sp, #0x20]");
            sb.AppendLine(" stp x10, x11, [sp, #0x30]");
            sb.AppendLine(" stp x25, x26, [sp, #0x40]");
            sb.AppendLine(" mov x15, 1");
            sb.AppendLine(" mov x14, 2");
            sb.AppendLine(" mov x13, 3");
            sb.AppendLine(" mov x12, 4");
            sb.AppendLine(" mov x11, 5");
            sb.AppendLine(" mov x10, 6");
            sb.AppendLine(" mov w25, 0x0");
            sb.AppendLine(" mov w26, 0x40");
            sb.AppendLine("\n" + funcName + "start:");
            sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
            if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
            for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
            {
                // Branch target is the next instruction, so taken or not it falls through.
                string jumpLabel = $"{funcName}_w25_target{fillerIdx}";
                sb.AppendLine($" cmp x15, x10");
                sb.AppendLine($" b.eq {jumpLabel}");
                sb.AppendLine($"{jumpLabel}:");
            }
            sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
            if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
            for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
            {
                string jumpLabel = $"{funcName}_w26_target{fillerIdx}";
                sb.AppendLine($" cmp x15, x10");
                sb.AppendLine($" b.eq {jumpLabel}");
                sb.AppendLine($"{jumpLabel}:");
            }
            sb.AppendLine(" sub x0, x0, 1");
            sb.AppendLine(" cbnz x0, " + funcName + "start");
            // Epilogue: restore saved registers.
            sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
            sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
            sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
            sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
            sb.AppendLine(" add sp, sp, #0x50");
            sb.AppendLine(" ret\n\n");
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixFAdd256and32RfTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixFAdd256and32RfTest : UarchTest
{
    public MixFAdd256and32RfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fadd256and32rf";
        this.Description = "Mixed 32-bit scalar and 256-bit FP RF capacity";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // aarch64 is deliberately excluded; no codegen exists for it.
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.mips64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            string setup = " vmovups (%r8), %ymm0\n" +
                " movss (%r8), %xmm1\n" +
                " vmovups %ymm0, %ymm2\n" +
                " movss (%r8), %xmm3\n" +
                " vmovups %ymm0, %ymm4\n" +
                " movss (%r8), %xmm5\n";
            // Alternate 256-bit vector adds with 32-bit scalar adds.
            string[] fillers =
            {
                " vaddps %ymm0, %ymm1, %ymm1",
                " addss %xmm5, %xmm2",
                " vaddps %ymm0, %ymm3, %ymm3",
                " addss %xmm5, %xmm4"
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillers, fillers, initInstrs: setup);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // Touch every 256-bit vector and scalar FP register once up front.
            string setup = "";
            for (int regIdx = 0; regIdx < 32; regIdx++)
            {
                setup += $" xvld $xr{regIdx}, $r6, {regIdx * 32}\n";
                setup += $" fld.s $f{regIdx}, $r6, {regIdx * 4}\n";
            }
            string[] fillers =
            {
                " xvfadd.s $xr1, $xr1, $xr1",
                " fadd.s $f11, $f11, $f11",
                " xvfadd.s $xr3, $xr3, $xr3",
                " fadd.s $f12, $f12, $f12"
            };
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, fillers, fillers, includePtrChasingLoads: false, initInstrs: setup);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixFpRfDepBranchTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixFpRfDepBranchTest : UarchTest
{
    // A dependent branch is inserted after every `interval` FP adds.
    private int interval;

    public MixFpRfDepBranchTest(int low, int high, int step, int interval)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixfprfdepbranch" + interval;
        this.Description = "FP Register File, with some dependent branches";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *fpArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.interval = interval;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            string initInstrs = " ldr s17, [x2]\n" +
                " ldr s18, [x2, 4]\n" +
                " ldr s19, [x2, 8]\n" +
                " ldr s20, [x2, 12]\n" +
                " ldr s21, [x2, 16]\n";
            // Fixed: List/List() had lost their <string> type arguments and did not compile.
            List<string> unrolledAddsList = new List<string>();
            for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++)
            {
                int regnum = 18 + (i % 4);
                unrolledAddsList.Add($" fadd s{regnum}, s{regnum}, s17");
                // NOTE(review): i starts at 0, so a branch is also emitted before the
                // first add (i % interval == 0 at i == 0); the integer RF variant of
                // this test starts at 1. Confirm which is intended.
                if (i % interval == 0) unrolledAddsList.Add(" cmp x25, x26\n b.eq mixfpjumpsched_badthing" + interval);
            }
            string[] unrolledAdds = unrolledAddsList.ToArray();
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, initInstrs: initInstrs);
            // Branch target that must never be reached.
            sb.AppendLine($"mixfpjumpsched_badthing{interval}:\n .word 0xf7f0a000");
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixFpVecRfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixFpVecRfTest : UarchTest
{
    // When true, a dependent branch precedes the filler instructions.
    private bool initialDependentBranch;

    public MixFpVecRfTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.initialDependentBranch = initialDependentBranch;
        string suffix = initialDependentBranch ? "db" : string.Empty;
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixfpvecrf" + suffix;
        this.Description = "Mixed FP/128-bit FP vec rf" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // Both variants generate riscv code only.
        return isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.riscv) return;

        string initInstrs = " vsetvli t5, t6, e32\n vlw.v v0, (a1)\n fld f0, (a1)";
        string postLoad = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;
        postLoad += "\n mv t6, a2";
        // Alternate a vector FP add with a scalar FP add.
        string[] fillers =
        {
            " vfadd.vv v0, v0, v0",
            " fadd.s f0, f0, f0"
        };
        UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillers, fillers, false,
            initInstrs: initInstrs, postLoadInstrs1: postLoad, postLoadInstrs2: postLoad);
        if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
    }
}
}
================================================
FILE: AsmGen/tests/MixIntRfDepBranchTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixIntRfDepBranchTest : UarchTest
{
    // A dependent branch is inserted after every `interval` integer adds.
    private int interval;

    public MixIntRfDepBranchTest(int low, int high, int step, int interval)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixintrfdepbranch" + interval;
        this.Description = "Integer Register File, with some dependent branches";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
        this.interval = interval;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            // Fixed: List/List() had lost their <string> type arguments and did not compile.
            List<string> unrolledAddsList = new List<string>();
            // i runs 1..max so the first dependent branch lands after `interval` adds.
            for (int i = 1; i < this.Counts[this.Counts.Length - 1] + 1; i++)
            {
                int regnum = 12 + (i % 4);
                unrolledAddsList.Add($" add x{regnum}, x{regnum}, x11");
                if (i % interval == 0) unrolledAddsList.Add(" cmp x25, x26\n b.eq mixintjumpsched_badthing" + interval);
            }
            string[] unrolledAdds = unrolledAddsList.ToArray();
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);
            // Branch target that must never be reached.
            sb.AppendLine($"mixintjumpsched_badthing{interval}:\n .word 0xf7f0a000");
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixIntVec128RfTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixIntVec128RfTest : UarchTest
{
    // When true (aarch64 only), a dependent branch precedes the filler instructions.
    private bool initialDependentBranch;

    public MixIntVec128RfTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.initialDependentBranch = initialDependentBranch;
        string suffix = initialDependentBranch ? "db" : string.Empty;
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixintvec128" + suffix;
        this.Description = "Mixed integer and 128-bit vector register file capacity" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // The dependent-branch variant only has aarch64 codegen.
        if (this.initialDependentBranch) return isa == IUarchTest.ISA.aarch64;
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            string setup = " vmovups (%r8), %ymm0\n" +
                " movss (%r8), %xmm1\n" +
                " vmovups %ymm0, %ymm2\n" +
                " movss (%r8), %xmm3\n" +
                " vmovups %ymm0, %ymm4\n" +
                " movss (%r8), %xmm5\n";
            // Alternate scalar integer adds with scalar SSE FP adds.
            string[] fillers =
            {
                " add %r11, %r15",
                " addss %xmm5, %xmm2",
                " add %r11, %r14",
                " addss %xmm5, %xmm4"
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillers, fillers, initInstrs: setup);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string branch = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            string setup = " ldr q0, [x1]\n" +
                " ldr q1, [x1, #0x10]\n" +
                " ldr q2, [x1, #0x20]\n" +
                " ldr q3, [x1, #0x30]\n" +
                " ldr q4, [x1, #0x40]\n";
            // Alternate 128-bit vector adds with scalar integer adds.
            string[] fillers =
            {
                " add v1.4s, v1.4s, v0.4s",
                " add x15, x15, x11",
                " add v2.4s, v2.4s, v0.4s",
                " add x14, x14, x11"
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, fillers, fillers, false, setup, postLoadInstrs1: branch, postLoadInstrs2: branch);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixIntrfFprfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixIntFpRfTest : UarchTest
{
    // If set, each measured interval is preceded by a branch that depends on a
    // load result, so the structure under test must buffer ops past an
    // unresolved branch.
    private bool initialDependentBranch;

    public MixIntFpRfTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixintfprf" + (initialDependentBranch ? "db" : string.Empty);
        this.Description = "Mixed INT/FP Register File" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // Only the riscv generator is currently enabled; the other checks are
        // kept below for future re-enablement.
        //if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;
        //if (isa == IUarchTest.ISA.amd64) return true;
        //if (isa == IUarchTest.ISA.aarch64) return true;
        //if (isa == IUarchTest.ISA.mips64) return true;
        if (isa == IUarchTest.ISA.riscv) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // todo
            string initInstrs = "  movss (%r8), %xmm1\n" +
                "  movss 4(%r8), %xmm2\n" +
                "  movss 8(%r8), %xmm3\n" +
                "  movss 12(%r8), %xmm4\n" +
                "  movss 16(%r8), %xmm5\n";
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = "  addss %xmm1, %xmm2";
            unrolledAdds[1] = "  addss %xmm1, %xmm3";
            unrolledAdds[2] = "  addss %xmm1, %xmm4";
            unrolledAdds[3] = "  addss %xmm1, %xmm5";
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {// todo
            string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            string initInstrs = "  ldr s17, [x2]\n" +
                "  ldr s18, [x2, 4]\n" +
                "  ldr s19, [x2, 8]\n" +
                "  ldr s20, [x2, 12]\n" +
                "  ldr s21, [x2, 16]\n";
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = "  fadd s18, s18, s17";
            unrolledAdds[1] = "  fadd s19, s19, s17";
            unrolledAdds[2] = "  fadd s20, s20, s17";
            unrolledAdds[3] = "  fadd s21, s21, s17";
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
        else if (isa == IUarchTest.ISA.mips64)
        {// todo
            string initInstrs = "  fld.s $f8, $r6, 0\n" +
                "  fld.s $f9, $r6, 4\n" +
                "  fld.s $f10, $r6, 8\n" +
                "  fld.s $f11, $r6, 12\n" +
                "  fld.s $f12, $r6, 16\n";
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = "  fadd.s $f9, $f9, $f8";
            unrolledAdds[1] = "  fadd.s $f10, $f10, $f8";
            unrolledAdds[2] = "  fadd.s $f11, $f11, $f8";
            unrolledAdds[3] = "  fadd.s $f12, $f12, $f8";
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
            string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;
            string initInstrs = "  fld f0, (x12)\n" +
                "  fld f1, 8(x12)\n" +
                "  fld f2, 16(x12)\n" +
                "  fld f3, 24(x12)\n" +
                "  fld f4, 32(x12)\n";
            // BUGFIX: restore the generic type argument; System.Collections.Generic
            // has no non-generic List, so the bare "List" form does not compile.
            List<string> unrolledAdds = new List<string>();
            /* for C910 */
            for (int i = 0; i < 30; i++) unrolledAdds.Add($"  fadd.s f{i % 4}, f{i % 4}, f4");
            for (int i = 0; i < 200; i++) unrolledAdds.Add($"  add x28, x28, x29");
            /*unrolledAdds.Add("  fadd.s f0, f0, f4");
            unrolledAdds.Add("  add x28, x28, x29");
            unrolledAdds.Add("  fadd.s f1, f1, f4");
            unrolledAdds.Add("  add x30, x30, x29");
            unrolledAdds.Add("  fadd.s f2, f2, f4");
            unrolledAdds.Add("  add x31, x31, x29");
            unrolledAdds.Add("  fadd.s f3, f3, f4");
            unrolledAdds.Add("  add x18, x18, x29");*/
            string[] unrolledAddsArr = unrolledAdds.ToArray();
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAddsArr, unrolledAddsArr,
                includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixJumpStoreDataSched.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixJumpStoreDataSched : UarchTest
{
    public MixJumpStoreDataSched(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixjumpstoredatasched";
        this.Description = "Scheduler, Mixed Jumps and Store Data";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatarr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // x86-64 only; no other ISA generator has been written for this test.
        return isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // Interleave compare+jump pairs with plain stores. The jump target traps
        // with int3, so the jumps are expected never to be taken.
        string[] unrolledJumps =
        {
            "  cmp %rdi, %rsi\n  je mixjumpstoredatasched_reallybadthing",
            "  mov %rdi, (%r8)",
            "  cmp %rdi, %rsi\n  je mixjumpstoredatasched_reallybadthing",
            "  mov %rdi, 64(%r8)"
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);
        // Land on a breakpoint trap if a "never taken" jump fires.
        sb.AppendLine("mixjumpstoredatasched_reallybadthing:\n  int3");
    }
}
}
================================================
FILE: AsmGen/tests/MixJumpStoreSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixJumpStoreSchedTest : UarchTest
{
    public MixJumpStoreSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixjumpstoresched";
        this.Description = "Scheduler, Mixed Jumps and Stores (Address Dependency)";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatarr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // x86-64 only; no other ISA generator has been written for this test.
        return isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.amd64) return;

        // Interleave compare+jump pairs with stores whose address depends on %rdi.
        // The jump target traps with int3, so the jumps are expected never taken.
        string[] unrolledJumps =
        {
            "  cmp %rdi, %rsi\n  je mixstorejumpsched_reallybadthing",
            "  mov %r14, (%r8, %rdi, 2)",
            "  cmp %rdi, %rsi\n  je mixstorejumpsched_reallybadthing",
            "  mov %r14, 64(%r8, %rdi, 2)"
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);
        // Land on a breakpoint trap if a "never taken" jump fires.
        sb.AppendLine("mixstorejumpsched_reallybadthing:\n  int3");
    }
}
}
================================================
FILE: AsmGen/tests/MixJumpThenAddSched.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixJumpThenAddSched : UarchTest
{
    public MixJumpThenAddSched(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixjumpthenaddsched";
        this.Description = "Scheduler, 40 NT jumps + adds";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // aarch64 only; other ISA checks intentionally left disabled.
        // if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        // if (isa == IUarchTest.ISA.mips64) return true;
        // if (isa == IUarchTest.ISA.riscv) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            // First 40 slots are compare-and-branch pairs; remaining slots up to
            // the largest tested count are independent adds. The branch target
            // emits an undefined-looking encoding (.word), presumably to trap if
            // one of the branches is ever taken — confirm against the harness.
            // BUGFIX: restore the generic type argument; System.Collections.Generic
            // has no non-generic List, so the bare "List" form does not compile.
            List<string> unrolledJumps = new List<string>();
            int instrIdx;
            for (instrIdx = 0; instrIdx < 40; instrIdx++) unrolledJumps.Add("  cmp x25, x26\n  b.eq mixaddthenjumpsched_reallybadthing");
            for (; instrIdx < this.Counts[this.Counts.Length - 1]; instrIdx++) unrolledJumps.Add("  add x15, x15, x25");
            string[] instrs = unrolledJumps.ToArray();
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, instrs, instrs, includePtrChasingLoads: true, dsb: true);
            sb.AppendLine("mixaddthenjumpsched_reallybadthing:\n  .word 0xf7f0a000");
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixLdqStqTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixLdqStqTest : UarchTest
{
    // When true, the ARM variant places a dependent branch ahead of the measured
    // memory ops so they stay pending behind an unresolved branch.
    private bool initialDependentBranch;

    public MixLdqStqTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixldqstq" + (initialDependentBranch ? "db" : string.Empty);
        this.Description = "Mixed Load/Store Queue Test (mem ops pending retire)" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, int *arr1";
        this.GetFunctionCallParameters = "structIterations, A, B";
        this.initialDependentBranch = initialDependentBranch;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
                GenerateX86GccAsm(sb);
                break;
            case IUarchTest.ISA.aarch64:
                GenerateArmAsm(sb);
                break;
        }
    }

    // Alternating store/load pairs to two separate arrays.
    // NOTE(review): initialDependentBranch is only honored by the ARM path — confirm intent.
    public void GenerateX86GccAsm(StringBuilder sb)
    {
        string[] instrs =
        {
            "  mov %r15, (%r8)",
            "  mov (%rdx), %r14",
            "  mov %r13, (%r8)",
            "  mov (%rdx), %r12"
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, instrs, instrs, true);
    }

    public void GenerateArmAsm(StringBuilder sb)
    {
        string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
        string[] instrs =
        {
            "  str x15, [x2]",
            "  ldr x14, [x1]",
            "  str x13, [x2]",
            "  ldr x12, [x1]"
        };
        UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
            sb, this.Counts, this.Prefix, instrs, instrs, true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
        if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
    }
}
}
================================================
FILE: AsmGen/tests/MixLoadStoreDivSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixLoadStoreDivSchedTest : UarchTest
{
    public MixLoadStoreDivSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixloadstoredivsched";
        this.Description = "Load/Store Scheduler Capacity Test, using divs to block retirement";
        this.FunctionDefinitionParameters = "uint64_t iterations, int count, int *arr2, int *arr3";
        this.GetFunctionCallParameters = "structIterations, list_size, B, A";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
                GenerateX86Asm(sb);
                break;
            case IUarchTest.ISA.aarch64:
                GenerateArmAsm(sb);
                break;
        }
    }

    // One load and one store per unrolled pair; addresses are indexed by %rdx /
    // w25 / w26, presumably the division result — confirm against the Div helper.
    public void GenerateX86Asm(StringBuilder sb)
    {
        string[] dependentLoads =
        {
            "  mov (%r9, %rdx, 4), %r15",
            "  mov %r14, (%r8, %rdx, 4)"
        };
        UarchTestHelpers.GenerateX86AsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads, false);
    }

    public void GenerateArmAsm(StringBuilder sb)
    {
        string[] dependentLoads =
        {
            "  ldr w15, [x3, w25, uxtw #2]",
            "  str w14, [x2, w25, uxtw #2]"
        };
        string[] dependentLoads1 =
        {
            "  ldr w15, [x3, w26, uxtw #2]",
            "  str w14, [x2, w26, uxtw #2]"
        };
        UarchTestHelpers.GenerateArmAsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads1, false);
    }
}
}
================================================
FILE: AsmGen/tests/MixLoadStoreSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixLoadStoreSched : UarchTest
{
    public MixLoadStoreSched(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixloadstoresched";
        this.Description = "Mixed Load/Store Address Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // Stores and loads indexed by %rdi / %rsi — presumably the pointer
            // chasing load results, making the addresses slow to resolve.
            string[] dependentLoads =
            {
                "  mov %r15, (%r8, %rdi, 4)",
                "  mov (%r8, %rdi, 2), %r14",
                "  mov %r13, (%r8, %rdi, 4)",
                "  mov (%r8, %rdi, 2), %r12"
            };
            string[] dependentLoads1 =
            {
                "  mov %r15, (%r8, %rsi, 4)",
                "  mov (%r8, %rsi, 4), %r14",
                "  mov %r13, (%r8, %rsi, 4)",
                "  mov (%r8, %rsi, 4), %r12"
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads1, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string[] dependentLoads =
            {
                "  str w15, [x2, w25, uxtw #2]",
                "  ldr w14, [x1, w25, uxtw #0]",
                "  str w13, [x2, w25, uxtw #2]",
                "  ldr w12, [x1, w25, uxtw #0]"
            };
            string[] dependentLoads1 =
            {
                "  str w15, [x2, w26, uxtw #2]",
                "  ldr w14, [x1, w26, uxtw #0]",
                "  str w13, [x2, w26, uxtw #2]",
                "  ldr w12, [x1, w26, uxtw #0]"
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads1, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            // x5 and x6 are pointer chasing loads
            string postLoadInstrs1 = "  andi x7, x5, 0xF\n  add x7, x7, x12";
            string postLoadInstrs2 = "  andi x7, x6, 0xF\n  add x7, x7, x12";
            string[] dependentLoads =
            {
                "  sd x28, (a2)",
                "  ld x29, 8(a2)",
                "  sd x30, 16(a2)",
                "  ld x31, 24(a2)"
            };
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads, includePtrChasingLoads: true,
                postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixStoreDivSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MixStoreDivSchedTest : UarchTest
{
    public MixStoreDivSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixstoresched";
        this.Description = "Store (Mixed Data/Address) Scheduler Capacity Test";
        this.FunctionDefinitionParameters = "uint64_t iterations, int count, int *arr2";
        this.GetFunctionCallParameters = "structIterations, list_size, B";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
                GenerateX86Asm(sb);
                break;
            case IUarchTest.ISA.aarch64:
                GenerateArmAsm(sb);
                break;
        }
    }

    // Stores alternating between address-dependence and data-dependence on %rdx /
    // w25 / w26 — presumably the division result; confirm against the Div helper.
    public void GenerateX86Asm(StringBuilder sb)
    {
        string[] dependentStores =
        {
            "  mov %rdx, (%r8, %r15, 4)",
            "  mov %r15, (%r8, %rdx, 4)",
            "  mov %rdx, (%r8, %r15, 4)",
            "  mov %r15, (%r8, %rdx, 4)"
        };
        string[] dependentStores1 =
        {
            "  mov %rdx, (%r8, %r11, 4)",
            "  mov %r11, (%r8, %rdx, 4)",
            "  mov %rdx, (%r8, %r11, 4)",
            "  mov %r11, (%r8, %rdx, 4)"
        };
        UarchTestHelpers.GenerateX86AsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, false);
    }

    public void GenerateArmAsm(StringBuilder sb)
    {
        string[] dependentStores =
        {
            "  str w25, [x2, w15, uxtw #2]",
            "  str w15, [x2, w25, uxtw #2]",
            "  str w25, [x2, w15, uxtw #2]",
            "  str w15, [x2, w25, uxtw #2]"
        };
        string[] dependentStores1 =
        {
            "  str w26, [x2, w15, uxtw #2]",
            "  str w15, [x2, w26, uxtw #2]",
            "  str w26, [x2, w15, uxtw #2]",
            "  str w15, [x2, w26, uxtw #2]"
        };
        UarchTestHelpers.GenerateArmAsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, false);
    }
}
}
================================================
FILE: AsmGen/tests/MixVec512Vec256BlockRfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixVec512Vec256BlockRfTest : UarchTest
{
    // number of tiny (256-bit) registers/instructions emitted before the 512-bit block
    private int nTiny;

    public MixVec512Vec256BlockRfTest(int low, int high, int step, int nTiny)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixvec512vec256blockrf" + nTiny;
        this.Description = $"Mixed zmm/ymm regs - AVX-512 only, {nTiny} 256-bit then 512-bit";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        // BUGFIX: the field was never assigned, so GenerateAsm always saw
        // nTiny == 0 and emitted no 256-bit instructions regardless of the
        // constructor argument (only the Prefix/Description reflected it).
        this.nTiny = nTiny;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // use even numbered regs for ymm testing
            string initInstrs = "  vmovups (%r8), %zmm1\n" +
                "  vmovups 64(%r8), %ymm2\n" +
                "  vmovups 128(%r8), %zmm3\n" +
                "  vmovups 192(%r8), %ymm4\n" +
                "  vmovups 256(%r8), %zmm5\n";
            // use all zmm regs
            for (int i = 6; i < 32; i++)
            {
                if ((i & 1) == 0) initInstrs += "vmovups %ymm2, %ymm" + i + "\n";
                else initInstrs += "vmovups %zmm5, %zmm" + i + "\n";
            }
            // BUGFIX: restore the generic type argument; System.Collections.Generic
            // has no non-generic List, so the bare "List" form does not compile.
            List<string> instrsList = new List<string>();
            // First nTiny instructions use 256-bit (ymm) destinations...
            for (int i = 0; i < nTiny; i++)
            {
                int regNum = ((i & 1) == 0) ? i & 0x1F : (i + 1) & 0x1F;
                instrsList.Add($"  vxorps %ymm2, %ymm{regNum}, %ymm{regNum}");
            }
            // ...then the rest, up to the largest tested count, use 512-bit (zmm).
            for (int i = nTiny; i < this.Counts[this.Counts.Length - 1]; i++)
            {
                int regNum = ((i & 1) == 0) ? i : (i + 1);
                regNum = (regNum + 1) & 0x1F;
                instrsList.Add($"  vxorps %zmm1, %zmm{regNum}, %zmm{regNum}");
            }
            string[] unrolledAdds = instrsList.ToArray();
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MixVec512Vec256RfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
public class MixVec512Vec256RfTest : UarchTest
{
    public MixVec512Vec256RfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mixvec512vec256rf";
        this.Description = "Mixed zmm/ymm regs - AVX-512 only, alternating";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        return false;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // use even numbered regs for ymm testing
            string initInstrs = "  vmovups (%r8), %zmm1\n" +
                "  vmovups 64(%r8), %ymm2\n" +
                "  vmovups 128(%r8), %zmm3\n" +
                "  vmovups 192(%r8), %ymm4\n" +
                "  vmovups 256(%r8), %zmm5\n";
            // use all zmm regs
            for (int i = 6; i < 32; i++)
            {
                if ((i & 1) == 0) initInstrs += "vmovups %ymm2, %ymm" + i + "\n";
                else initInstrs += "vmovups %zmm5, %zmm" + i + "\n";
            }
            // BUGFIX: restore the generic type argument; System.Collections.Generic
            // has no non-generic List, so the bare "List" form does not compile.
            List<string> instrsList = new List<string>();
            // Alternate 256-bit (even regs) and 512-bit (odd regs) adds across 1..31.
            for (int i = 1; i < 32; i++)
            {
                if ((i & 1) == 0) instrsList.Add($"  vaddps %ymm2, %ymm{i}, %ymm{i}");
                else instrsList.Add($"  vaddps %zmm1, %zmm{i}, %zmm{i}");
            }
            string[] unrolledAdds = instrsList.ToArray();
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs);
        }
    }
}
}
================================================
FILE: AsmGen/tests/MmxRfTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MmxRfTest : UarchTest
{
    public MmxRfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mmxrf";
        this.Description = "64-bit MMX RF Capacity Test. x86 only";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, int *arr2";
        this.GetFunctionCallParameters = "structIterations, A, B";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);
    }

    public void GenerateX86GccAsm(StringBuilder sb)
    {
        // fsave stashes the FPU/MMX state into the buffer at %r8 before the test
        // body clobbers mm0-mm4; frstor restores it afterwards.
        string initInstrs =
            "  fsave (%r8)\n" +
            "  movq (%rdx), %mm0\n" +
            "  movq 8(%rdx), %mm1\n" +
            "  movq 16(%rdx), %mm2\n" +
            "  movq 24(%rdx), %mm3\n" +
            "  movq 32(%rdx), %mm4\n";
        string cleanupInstrs = "  frstor (%r8)";
        // Four independent packed-add chains all reading mm0.
        string[] unrolledAdds =
        {
            "  paddw %mm0, %mm1",
            "  paddw %mm0, %mm2",
            "  paddw %mm0, %mm3",
            "  paddw %mm0, %mm4"
        };
        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
            sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs, cleanupInstrs: cleanupInstrs);
    }
}
}
================================================
FILE: AsmGen/tests/MulSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class MulSchedTest : UarchTest
{
    public MulSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "mulsched";
        this.Description = "Scheduler, Integer Multiplies";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // A generator exists for every ISA below.
        return isa == IUarchTest.ISA.amd64
            || isa == IUarchTest.ISA.aarch64
            || isa == IUarchTest.ISA.mips64
            || isa == IUarchTest.ISA.riscv;
    }

    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        switch (isa)
        {
            case IUarchTest.ISA.amd64:
            {
                // Four multiply chains, all sourcing %rdi.
                string[] unrolledMuls =
                {
                    "  imul %rdi, %r15",
                    "  imul %rdi, %r14",
                    "  imul %rdi, %r13",
                    "  imul %rdi, %r12"
                };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledMuls, unrolledMuls, includePtrChasingLoads: false);
                break;
            }
            case IUarchTest.ISA.aarch64:
            {
                string[] unrolledMuls =
                {
                    "  mul x15, x15, x25",
                    "  mul x14, x14, x25",
                    "  mul x13, x13, x25",
                    "  mul x12, x12, x25"
                };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledMuls, unrolledMuls, includePtrChasingLoads: false);
                break;
            }
            case IUarchTest.ISA.mips64:
            {
                // NOTE(review): these mnemonics ($rN, mul.d) look LoongArch-style
                // despite the mips64 enum value — confirm the target assembler.
                string[] unrolledAdds =
                {
                    "  mul.d $r15, $r15, $r12",
                    "  mul.d $r16, $r16, $r12",
                    "  mul.d $r17, $r17, $r12",
                    "  mul.d $r18, $r18, $r12"
                };
                string[] unrolledAdds1 =
                {
                    "  mul.d $r15, $r15, $r13",
                    "  mul.d $r16, $r16, $r13",
                    "  mul.d $r17, $r17, $r13",
                    "  mul.d $r18, $r18, $r13"
                };
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, includePtrChasingLoads: true);
                break;
            }
            case IUarchTest.ISA.riscv:
            {
                string[] unrolledMuls =
                {
                    "  mul x30, x30, x5",
                    "  mul x29, x29, x5",
                    "  mul x28, x28, x5",
                    "  mul x31, x31, x5"
                };
                string[] unrolledMuls1 =
                {
                    "  mul x30, x30, x6",
                    "  mul x31, x31, x6",
                    "  mul x28, x28, x6",
                    "  mul x29, x29, x6"
                };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledMuls, unrolledMuls1, false);
                break;
            }
        }
    }
}
}
================================================
FILE: AsmGen/tests/NopLoopTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class NopLoopTest : UarchTest
{
/// <summary>
/// NOP throughput test: generates loops of varying sizes filled with NOPs,
/// with the loop decrement/branch counted as part of the loop size.
/// </summary>
/// <param name="high">Largest loop size (in instructions) to test;
/// must be greater than 2 because every loop contains a decrement and a branch
/// (counts start at 3).</param>
/// <param name="step">Increment between tested loop sizes.</param>
public NopLoopTest(int high, int step)
{
this.Counts = UarchTestHelpers.GenerateCountArray(3, high, step);
this.Prefix = "noploop";
this.Description = $"NOP throughput for various loop sizes";
this.FunctionDefinitionParameters = "uint64_t iterations";
this.GetFunctionCallParameters = "structIterations";
// Report per-instruction time by dividing total time by the loop's instruction count.
this.DivideTimeByCount = true;
}
public override bool SupportsIsa(IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64) return true;
if (isa == IUarchTest.ISA.aarch64) return true;
if (isa == IUarchTest.ISA.mips64) return false; // explicitly not implemented
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);
if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb);
}
// Emits one x86 loop per tested count: (count - 2) NOPs followed by dec/jnz,
// iterating until the count in %rdi reaches zero.
public void GenerateX86GccAsm(StringBuilder sb)
{
for (int i = 0; i < Counts.Length; i++)
{
string funcName = this.Prefix + this.Counts[i];
sb.AppendLine(funcName + ":");
// count dec, jnz as instructions in the loop
for (int nopIdx = 0; nopIdx < this.Counts[i] - 2; nopIdx++) sb.AppendLine("  nop");
sb.AppendLine("  dec %rdi");
sb.AppendLine("  jnz " + funcName);
sb.AppendLine("  ret");
}
}
// Same structure for aarch64: (count - 2) NOPs followed by sub/cbnz on x0.
public void GenerateArmAsm(StringBuilder sb)
{
for (int i = 0; i < Counts.Length; i++)
{
string funcName = this.Prefix + this.Counts[i];
sb.AppendLine(funcName + ":");
// count dec, jnz as instructions in the loop
for (int nopIdx = 0; nopIdx < this.Counts[i] - 2; nopIdx++) sb.AppendLine("  nop");
sb.AppendLine("  sub x0, x0, 1");
sb.AppendLine("  cbnz x0, " + funcName);
sb.AppendLine("  ret");
}
}
}
}
================================================
FILE: AsmGen/tests/PdepSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
public class PdepSchedTest : UarchTest
{
public PdepSchedTest(int low, int high, int step)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "pdepsched";
this.Description = "Scheduler, PDEP";
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
this.GetFunctionCallParameters = "structIterations, A";
this.DivideTimeByCount = false;
}
public override bool SupportsIsa(IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64) return true;
return false;
}
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
if (isa == IUarchTest.ISA.amd64)
{
string[] unrolledAdds = new string[4];
unrolledAdds[0] = " pdep %rdi, %r15, %r15";
unrolledAdds[1] = " pdep %rdi, %r14, %r14";
unrolledAdds[2] = " pdep %rdi, %r13, %r13";
unrolledAdds[3] = " pdep %rdi, %r12, %r12";
UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false);
}
}
}
}
================================================
FILE: AsmGen/tests/ReturnStackTest.cs
================================================
using System;
using System.Text;
namespace AsmGen
{
public class ReturnStackTest : UarchTest
{
public override bool SupportsIsa(IUarchTest.ISA isa)
{
    // A generator exists for all four ISAs.
    return isa == IUarchTest.ISA.amd64
        || isa == IUarchTest.ISA.aarch64
        || isa == IUarchTest.ISA.mips64
        || isa == IUarchTest.ISA.riscv;
}
// Counts holds the call depths to test (generated from low..high by step).
public ReturnStackTest(int low, int high, int step)
{
this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
this.Prefix = "returnstack";
this.Description = "Return Stack Depth Test";
this.FunctionDefinitionParameters = "uint64_t iterations";
this.GetFunctionCallParameters = "structIterations";
// Divide measured time by count — presumably to report per-call/return cost;
// confirm against how the harness interprets DivideTimeByCount.
this.DivideTimeByCount = true;
}
// Name helpers for generated functions and labels.
private string GetFunctionName(int count, int depth) => $"returnstack{count}_{depth}";
private string GetBranchFuncName(int branchCount) => Prefix + branchCount;
public string GetLabelName(string funcName, int part) => funcName + "part" + part;
public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
{
    // Dispatch to the per-ISA generator.
    switch (isa)
    {
        case IUarchTest.ISA.amd64:
            GenerateX86GccAsm(sb);
            break;
        case IUarchTest.ISA.aarch64:
            GenerateArmAsm(sb);
            break;
        case IUarchTest.ISA.mips64:
            GenerateMipsAsm(sb);
            break;
        case IUarchTest.ISA.riscv:
            GenerateRiscvAsm(sb);
            break;
    }
}
// For each tested call depth, emits a top-level loop function plus a chain of
// distinct callee functions, so every return goes back to a different address
// and a trivial last-return-address predictor cannot cover the whole chain.
public void GenerateX86GccAsm(StringBuilder sb)
{
    foreach (int callDepth in this.Counts)
    {
        string topLevelFunctionLabel = this.Prefix + callDepth;
        sb.AppendLine($"{topLevelFunctionLabel}:");
        sb.AppendLine("  xor %rax, %rax");
        sb.AppendLine($"{topLevelFunctionLabel}_loop:");
        // Loop %rdi times, invoking the head of the call chain each iteration.
        sb.AppendLine("  call " + GetFunctionName(callDepth, 0));
        sb.AppendLine("  dec %rdi");
        sb.AppendLine($"  jne {topLevelFunctionLabel}_loop");
        sb.AppendLine("  ret");
        // generate a batch of functions so we aren't returning to the same address
        // otherwise a simple predictor will suffice
        for (int callIdx = 0; callIdx < callDepth; callIdx++)
        {
            string funcName = GetFunctionName(callDepth, callIdx);
            sb.AppendLine($".global {funcName}");
            sb.AppendLine(".align 128"); // https://github.com/clamchowder/Microbenchmarks/issues/14
            sb.AppendLine($"{funcName}:");
            if (callIdx < callDepth - 1)
            {
                // Non-tail functions do a little work, then call the next link.
                sb.AppendLine("  add %rdi, %rax");
                sb.AppendLine("  call " + GetFunctionName(callDepth, callIdx + 1));
            }
            sb.AppendLine(".align 128");
            sb.AppendLine("  ret");
        }
    }
}
// aarch64 version: one top-level loop function per tested call depth, plus a
// chain of distinct callees so each return targets a different address.
public void GenerateArmAsm(StringBuilder sb)
{
for (int countIdx = 0; countIdx < this.Counts.Length; countIdx++)
{
int callDepth = this.Counts[countIdx];
string topLevelFunctionLabel = this.Prefix + callDepth;
sb.AppendLine($"{topLevelFunctionLabel}:");
// Spill fp/lr (x29/x30) because the loop below makes calls of its own.
sb.AppendLine("  sub sp, sp, #0x20");
sb.AppendLine("  stp x29, x30, [sp, #0x10]");
sb.AppendLine("  eor x3, x3, x3");
sb.AppendLine($"{topLevelFunctionLabel}_loop:");
// Loop x0 times, invoking the head of the call chain each iteration.
sb.AppendLine($"  bl " + GetFunctionName(callDepth, 0));
sb.AppendLine("  sub x0, x0, 1");
sb.AppendLine($"  cbnz x0, {topLevelFunctionLabel}_loop");
sb.AppendLine("  ldp x29, x30, [sp, #0x10]");
sb.AppendLine("  add sp, sp, #0x20");
sb.AppendLine("  ret");
for (int callIdx = 0; callIdx < callDepth; callIdx++)
{
string funcName = GetFunctionName(callDepth, callIdx);
sb.AppendLine($".global {funcName}");
sb.AppendLine($"{funcName}:");
sb.AppendLine($"  add x3, x3, x0");
if (callIdx < callDepth - 1)
{
// Unlike x86 'call', 'bl' writes the return address to x30 (the link
// register) instead of pushing it to the stack, so non-leaf callees
// must save/restore x30 around their own 'bl'.
sb.AppendLine("  sub sp, sp, #0x20");
sb.AppendLine("  stp x29, x30, [sp, #0x10]");
sb.AppendLine("  bl " + GetFunctionName(callDepth, callIdx + 1));
sb.AppendLine("  ldp x29, x30, [sp, #0x10]");
sb.AppendLine("  add sp, sp, #0x20");
}
sb.AppendLine("  ret");
}
}
}
// "mips64" version of the call-chain generator.
// NOTE(review): the mnemonics here ($rN, addi.d, st.d, ld.d, bl, jr $r1) look
// LoongArch-style rather than classic MIPS — confirm the intended assembler.
public void GenerateMipsAsm(StringBuilder sb)
{
for (int countIdx = 0; countIdx < this.Counts.Length; countIdx++)
{
int callDepth = this.Counts[countIdx];
string topLevelFunctionLabel = this.Prefix + callDepth;
sb.AppendLine($"{topLevelFunctionLabel}:");
// top level function runs for specified number of iterations
// r12 = constant 1 (loop decrement), r13 = constant 8 (stack frame size).
sb.AppendLine("  xor $r12, $r12, $r12");
sb.AppendLine("  xor $r13, $r13, $r13");
sb.AppendLine("  addi.d $r12, $r12, 1");
sb.AppendLine("  addi.d $r13, $r13, 8");
sb.AppendLine("  sub.d $sp, $sp, $r13");
sb.AppendLine("  st.d $r1, $sp, 0");
sb.AppendLine($"{topLevelFunctionLabel}_loop:");
// mips stack grows down
sb.AppendLine($"  bl " + GetFunctionName(callDepth, 0));
sb.AppendLine("  sub.d $r4, $r4, $r12");
sb.AppendLine($"  bnez $r4, {topLevelFunctionLabel}_loop");
sb.AppendLine("  ld.d $r1, $sp, 0");
sb.AppendLine("  add.d $sp, $sp, $r13");
sb.AppendLine("  jr $r1");
// generate the dummy functions
for (int callIdx = 0; callIdx < callDepth; callIdx++)
{
string funcName = GetFunctionName(callDepth, callIdx);
sb.AppendLine($".global {funcName}");
sb.AppendLine($"{funcName}:");
if (callIdx < callDepth - 1)
{
// Non-tail links spill the return address register ($r1) and call the next link.
sb.AppendLine("  sub.d $sp, $sp, $r13");
sb.AppendLine("  st.d $r1, $sp, 0"); // save return address
sb.AppendLine("  bl " + GetFunctionName(callDepth, callIdx + 1));
sb.AppendLine("  ld.d $r1, $sp, 0"); // load return address
sb.AppendLine("  add.d $sp, $sp, $r13");
}
sb.AppendLine("  jr $r1");
}
}
}
// Emits RISC-V assembly for each call depth in Counts: a looping top-level
// function per depth plus a chain of callee functions.
public void GenerateRiscvAsm(StringBuilder sb)
{
for (int countIdx = 0; countIdx < this.Counts.Length; countIdx++)
{
int callDepth = this.Counts[countIdx];
string topLevelFunctionLabel = this.Prefix + callDepth;
sb.AppendLine($"{topLevelFunctionLabel}:");
// top level function runs for specified number of iterations
// iteration count in x10
// reserve 16B (keeps sp 16B aligned) and save ra
sb.AppendLine(" addi sp, sp, -16");
sb.AppendLine(" sd ra, (sp)");
sb.AppendLine($"{topLevelFunctionLabel}_loop:");
sb.AppendLine($" jal " + GetFunctionName(callDepth, 0));
sb.AppendLine(" addi x10, x10, -1");
// NOTE(review): 'bge x10, x0' keeps looping while x10 >= 0 after the
// decrement, so the body runs iterations+1 times, unlike the nonzero
// checks used by the other ISAs - confirm the off-by-one is intentional
// (constant overhead, so it likely doesn't affect the measurement).
sb.AppendLine($" bge x10, x0, {topLevelFunctionLabel}_loop");
sb.AppendLine(" ld ra, (sp)");
sb.AppendLine(" addi sp, sp, 16");
sb.AppendLine(" ret");
// generate the dummy functions
for (int callIdx = 0; callIdx < callDepth; callIdx++)
{
string funcName = GetFunctionName(callDepth, callIdx);
sb.AppendLine($".global {funcName}");
sb.AppendLine($"{funcName}:");
if (callIdx < callDepth - 1)
{
sb.AppendLine(" addi sp, sp, -16"); // keep stack pointer 16B aligned even though we only save a 8B reg
sb.AppendLine(" sd ra, (sp)"); // save return address
sb.AppendLine(" jal " + GetFunctionName(callDepth, callIdx + 1));
sb.AppendLine(" ld ra, (sp)"); // load return address
sb.AppendLine(" addi sp, sp, 16");
}
sb.AppendLine(" ret");
}
}
}
}
}
================================================
FILE: AsmGen/tests/RobTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Reorder buffer capacity test: NOP filler is placed between long-latency
    // operations, and the structure-size sweep in Counts locates the capacity.
    public class RobTest : UarchTest
    {
        // "nop" filler used for both halves of the structure test.
        private readonly string[] fillerInstrs;
        // When set, each measurement interval starts with a branch dependent on a load.
        private readonly bool initialDependentBranch;

        public RobTest(int low, int high, int step, bool initialDependentBranch)
        {
            this.initialDependentBranch = initialDependentBranch;
            this.fillerInstrs = new string[] { "nop" };
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "rob" + (initialDependentBranch ? "db" : string.Empty);
            this.Description = "Reorder Buffer Test" + (initialDependentBranch ? " preceded by dependent branch" : string.Empty);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            // The dependent-branch variant is only implemented for aarch64 and riscv.
            if (this.initialDependentBranch)
                return isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.riscv;
            return isa == IUarchTest.ISA.amd64
                || isa == IUarchTest.ISA.aarch64
                || isa == IUarchTest.ISA.mips64
                || isa == IUarchTest.ISA.riscv;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            switch (isa)
            {
                case IUarchTest.ISA.amd64:
                    UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillerInstrs, fillerInstrs, true);
                    break;
                case IUarchTest.ISA.aarch64:
                {
                    string branchInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
                    UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillerInstrs, fillerInstrs, true, postLoadInstrs1: branchInstrs, postLoadInstrs2: branchInstrs);
                    if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
                    break;
                }
                case IUarchTest.ISA.mips64:
                    UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillerInstrs, fillerInstrs, includePtrChasingLoads: true);
                    break;
                case IUarchTest.ISA.riscv:
                {
                    string branchInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;
                    UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillerInstrs, fillerInstrs, includePtrChasingLoads: true, postLoadInstrs1: branchInstrs, postLoadInstrs2: branchInstrs);
                    if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
                    break;
                }
            }
        }
    }
}
================================================
FILE: AsmGen/tests/RorSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Scheduler capacity test using rotate-by-immediate instructions that all
    // depend on a pointer-chasing load result copied into r15.
    public class RorSchedTest : UarchTest
    {
        public RorSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "rorsched";
            this.Description = "Scheduler, Integer Rotate by Immediate (1)";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        // Only x86-64 is implemented for this test.
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64) return;

            // Each half copies its (presumably pointer-chasing) load result into
            // r15, so every rotate below waits on that load.
            string[] rotateInstrs = { " ror $1, %r15" };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                sb,
                this.Counts,
                this.Prefix,
                rotateInstrs,
                rotateInstrs,
                postLoadInstrs1: " mov %rdi, %r15",
                postLoadInstrs2: " mov %rsi, %r15",
                includePtrChasingLoads: false);
        }
    }
}
================================================
FILE: AsmGen/tests/ShlSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Scheduler capacity test using shift-by-immediate instructions that all
    // depend on a pointer-chasing load result copied into r15.
    public class ShlSchedTest : UarchTest
    {
        public ShlSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "shlsched";
            this.Description = "Scheduler, Integer Shift by Immediate (1)";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        // Only x86-64 is implemented for this test.
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64) return;

            // Each half copies its (presumably pointer-chasing) load result into
            // r15, so every shift below waits on that load.
            string[] shiftInstrs = { " shl $1, %r15" };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                sb,
                this.Counts,
                this.Prefix,
                shiftInstrs,
                shiftInstrs,
                postLoadInstrs1: " mov %rdi, %r15",
                postLoadInstrs2: " mov %rsi, %r15",
                includePtrChasingLoads: false);
        }
    }
}
================================================
FILE: AsmGen/tests/StoreDataDivNsqTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store data scheduler test: a chain of divisions blocks retirement while
    // stores whose data depends on the chain pile up in the scheduler.
    public class StoreDataDivNsqTest : UarchTest
    {
        public StoreDataDivNsqTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storedatadivnsq";
            this.Description = "Store Data Scheduler, using DIVs to block retirement";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // idiv puts the remainder in RDX, so these stores' data waits on
                // the blocking division chain.
                string[] blockedStores =
                {
                    " mov %rdx, (%r8, %r15, 4)",
                    " mov %rdx, (%r8, %r15, 4)",
                    " mov %rdx, (%r8, %r15, 4)",
                    " mov %rdx, (%r8, %r15, 4)",
                };
                // Control stores whose data (r14) is always ready.
                string[] readyStores =
                {
                    " mov %r14, (%r8, %r11, 4)",
                    " mov %r14, (%r8, %r11, 4)",
                    " mov %r14, (%r8, %r11, 4)",
                    " mov %r14, (%r8, %r11, 4)",
                };
                UarchTestHelpers.GenerateX86AsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, blockedStores, readyStores);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // w25 presumably carries the division chain result; the dependent
                // store's data register (w25) waits on it, the control one does not.
                string[] blockedStores = { " str w25, [x2, w15, uxtw #2]" };
                string[] readyStores = { " str w15, [x2, w15, uxtw #2]" };
                UarchTestHelpers.GenerateArmAsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, blockedStores, readyStores);
            }
        }
    }
}
================================================
FILE: AsmGen/tests/StoreDataNsqTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store data scheduler test sized to exclude non-scheduling queue entries.
    public class StoreDataNsq : UarchTest
    {
        public StoreDataNsq(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storedatansq";
            this.Description = "Store Data Scheduler, excluding NSQ";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        // Only amd64 is enabled; other ISAs are intentionally left disabled.
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.amd64;

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64) return;

            // Stores whose data (rdi) presumably comes from the long-latency
            // load, so they wait in the store data scheduler.
            // (These were misleadingly named "dependentLoads" before - they are stores.)
            string[] dependentStores =
            {
                " mov %rdi, (%r8)",
                " mov %rdi, 8(%r8)",
                " mov %rdi, 16(%r8)",
                " mov %rdi, 24(%r8)",
            };
            // Control stores whose data (r14) is always ready.
            string[] independentStores =
            {
                " mov %r14, (%r8)",
                " mov %r14, 8(%r8)",
                " mov %r14, 16(%r8)",
                " mov %r14, 24(%r8)",
            };
            UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentStores, independentStores);
        }
    }
}
================================================
FILE: AsmGen/tests/StoreDataSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store data scheduler capacity test: store data depends on a slow
    // (pointer-chasing) load, so stores queue up waiting for their data.
    public class StoreDataSchedTest : UarchTest
    {
        public StoreDataSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storedatasched";
            this.Description = "Store Data Scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        // mips64/riscv code paths exist below but are intentionally not enabled.
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // Store data comes from rdi/rsi, presumably the pointer-chasing load results.
                string[] depStoresA =
                {
                    " mov %rdi, (%r8)",
                    " mov %rdi, 8(%r8)",
                    " mov %rdi, 16(%r8)",
                    " mov %rdi, 24(%r8)",
                };
                string[] depStoresB =
                {
                    " mov %rsi, (%r8)",
                    " mov %rsi, 8(%r8)",
                    " mov %rsi, 16(%r8)",
                    " mov %rsi, 24(%r8)",
                };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, depStoresA, depStoresB, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // Store data comes from w25/w26, presumably the pointer-chasing load results.
                string[] depStoresA =
                {
                    " str w25, [x2, 8]",
                    " str w25, [x2, 16]",
                    " str w25, [x2, 24]",
                    " str w25, [x2, 32]",
                };
                string[] depStoresB =
                {
                    " str w26, [x2, 8]",
                    " str w26, [x2, 16]",
                    " str w26, [x2, 24]",
                    " str w26, [x2, 32]",
                };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, depStoresA, depStoresB, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                // Unreachable while SupportsIsa excludes mips64; kept for future use.
                string postLoadInstrs1 = " andi $r19, $r12, 0xF\n add.d $r19, $r19, $r6";
                string postLoadInstrs2 = " andi $r19, $r13, 0xF\n add.d $r19, $r19, $r6";
                string[] depLoads =
                {
                    " ld.d $r15, $r19, 0",
                    " ld.d $r16, $r19, 8",
                    " ld.d $r17, $r19, 12",
                    " ld.d $r18, $r19, 16",
                };
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, depLoads, depLoads, includePtrChasingLoads: true, null,
                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                // Unreachable while SupportsIsa excludes riscv; kept for future use.
                // x5 and x6 are pointer chasing loads
                string postLoadInstrs1 = " andi x7, x5, 0xF\n add x7, x7, x12";
                string postLoadInstrs2 = " andi x7, x6, 0xF\n add x7, x7, x12";
                string[] depLoads =
                {
                    " ld x28, (x7)",
                    " ld x29, 8(x7)",
                    " ld x30, 16(x7)",
                    " ld x31, 24(x7)",
                };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, depLoads, depLoads, includePtrChasingLoads: true,
                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
            }
        }
    }
}
================================================
FILE: AsmGen/tests/StoreDivNsqTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store address scheduler test: a division chain blocks retirement while
    // stores with chain-dependent addresses accumulate; sized to exclude the
    // non-scheduling queue.
    public class StoreDivNsqTest : UarchTest
    {
        public StoreDivNsqTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storedivnsq";
            this.Description = "Store Scheduler, using DIVs to block retirement, excluding NSQ";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // idiv puts the remainder in RDX, so these store addresses wait
                // on the blocking division chain.
                string[] blockedStores =
                {
                    " mov %r15w, (%r8, %rdx, 2)",
                    " mov %r15w, 2(%r8, %rdx, 2)",
                    " mov %r15w, 4(%r8, %rdx, 2)",
                    " mov %r15w, 6(%r8, %rdx, 2)",
                };
                // Control stores whose addresses are always ready.
                string[] readyStores =
                {
                    " mov %r11w, (%r8)",
                    " mov %r11w, 2(%r8)",
                    " mov %r11w, 4(%r8)",
                    " mov %r11w, 6(%r8)",
                };
                UarchTestHelpers.GenerateX86AsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, blockedStores, readyStores);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // w25 presumably carries the division chain result; the dependent
                // store's address indexes by it, the control store's does not.
                string[] blockedStores = { " str w15, [x2, w25, uxtw #2]" };
                string[] readyStores = { " str w15, [x2, w15, uxtw #2]" };
                UarchTestHelpers.GenerateArmAsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, blockedStores, readyStores);
            }
        }
    }
}
================================================
FILE: AsmGen/tests/StoreDivSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store address scheduler capacity test that blocks retirement with a
    // chained division sequence instead of pointer-chasing loads.
    public class StoreDivSchedTest : UarchTest
    {
        public StoreDivSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storedivsched";
            this.Description = "Store Address Scheduler Capacity Test, using divs to block retirement";
            this.FunctionDefinitionParameters = "uint64_t iterations, int count, int *arr2";
            this.GetFunctionCallParameters = "structIterations, list_size, B";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            switch (isa)
            {
                case IUarchTest.ISA.amd64: GenerateX86Asm(sb); break;
                case IUarchTest.ISA.aarch64: GenerateArmAsm(sb); break;
            }
        }

        public void GenerateX86Asm(StringBuilder sb)
        {
            // Store addresses index by rdx, which holds the remainder of the
            // blocking division chain, so they cannot issue early.
            // Instead of pointer chasing loads, a nasty block of chained integer
            // divisions blocks retirement: some older/less capable architectures
            // will not reorder loads ahead of stores with unknown addresses,
            // which breaks the usual technique.
            string[] depStoresA =
            {
                " mov %r15, (%r8, %rdx, 4)",
                " mov %r15, (%r8, %rdx, 4)",
                " mov %r15, (%r8, %rdx, 4)",
                " mov %r15, (%r8, %rdx, 4)",
            };
            string[] depStoresB =
            {
                " mov %r11, (%r8, %rdx, 4)",
                " mov %r11, (%r8, %rdx, 4)",
                " mov %r11, (%r8, %rdx, 4)",
                " mov %r11, (%r8, %rdx, 4)",
            };
            UarchTestHelpers.GenerateX86AsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, depStoresA, depStoresB, false);
        }

        public void GenerateArmAsm(StringBuilder sb)
        {
            // Store addresses index by w25/w26, presumably the division chain results.
            string[] depStoresA =
            {
                " str w15, [x2, w25, uxtw #2]",
                " str w15, [x2, w25, uxtw #2]",
                " str w15, [x2, w25, uxtw #2]",
                " str w15, [x2, w25, uxtw #2]",
            };
            string[] depStoresB =
            {
                " str w15, [x2, w26, uxtw #2]",
                " str w15, [x2, w26, uxtw #2]",
                " str w15, [x2, w26, uxtw #2]",
                " str w15, [x2, w26, uxtw #2]",
            };
            UarchTestHelpers.GenerateArmAsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, depStoresA, depStoresB, false);
        }
    }
}
================================================
FILE: AsmGen/tests/StoreNsq.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store address scheduler test sized to exclude any non-scheduling queue;
    // only implemented for aarch64.
    public class StoreNsq : UarchTest
    {
        public StoreNsq(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storensq";
            this.Description = "Store Address Scheduler, Excluding any NSQ";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64) return;

            // Addresses indexed by w25 (presumably the slow load result) must
            // wait in the scheduler.
            string[] blockedStores =
            {
                " str w15, [x2, w25, uxtw #2]",
                " str w14, [x2, w25, uxtw #2]",
                " str w13, [x2, w25, uxtw #2]",
                " str w12, [x2, w25, uxtw #2]",
            };
            // Control stores indexed by w26.
            string[] readyStores =
            {
                " str w15, [x2, w26, uxtw #2]",
                " str w14, [x2, w26, uxtw #2]",
                " str w13, [x2, w26, uxtw #2]",
                " str w12, [x2, w26, uxtw #2]",
            };
            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, blockedStores, readyStores);
        }
    }
}
================================================
FILE: AsmGen/tests/StoreSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store address scheduler capacity test: store addresses index by a
    // pointer-chasing load result, so the address generation must wait.
    public class StoreSchedTest : UarchTest
    {
        public StoreSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "storesched";
            this.Description = "Store Address Scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64
                || isa == IUarchTest.ISA.aarch64
                || isa == IUarchTest.ISA.riscv;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // Addresses index by rdi/rsi, presumably the pointer-chasing load results.
                string[] depStoresA =
                {
                    " mov %r15, (%r8, %rdi, 4)",
                    " mov %r14, (%r8, %rdi, 4)",
                    " mov %r13, (%r8, %rdi, 4)",
                    " mov %r12, (%r8, %rdi, 4)",
                };
                string[] depStoresB =
                {
                    " mov %r15, (%r8, %rsi, 4)",
                    " mov %r14, (%r8, %rsi, 4)",
                    " mov %r13, (%r8, %rsi, 4)",
                    " mov %r12, (%r8, %rsi, 4)",
                };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, depStoresA, depStoresB, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // Addresses index by w25/w26, presumably the pointer-chasing load results.
                string[] depStoresA =
                {
                    " str w15, [x2, w25, uxtw #2]",
                    " str w14, [x2, w25, uxtw #2]",
                    " str w13, [x2, w25, uxtw #2]",
                    " str w12, [x2, w25, uxtw #2]",
                };
                string[] depStoresB =
                {
                    " str w15, [x2, w26, uxtw #2]",
                    " str w14, [x2, w26, uxtw #2]",
                    " str w13, [x2, w26, uxtw #2]",
                    " str w12, [x2, w26, uxtw #2]",
                };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, depStoresA, depStoresB, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                // x5 and x6 are pointer chasing loads
                // NOTE(review): x7 is derived from the chasing loads here, but the
                // stores below address (a2) and never use x7, so their addresses do
                // not actually depend on the loads - confirm whether (x7) was intended
                // (the StoreDataSchedTest riscv path does use x7).
                string postLoadInstrs1 = " andi x7, x5, 0xF\n add x7, x7, x12";
                string postLoadInstrs2 = " andi x7, x6, 0xF\n add x7, x7, x12";
                string[] depStores =
                {
                    " sd x28, (a2)",
                    " sd x29, 8(a2)",
                    " sd x30, 16(a2)",
                    " sd x31, 24(a2)",
                };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, depStores, depStores, includePtrChasingLoads: true,
                    postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
            }
        }
    }
}
================================================
FILE: AsmGen/tests/Stq128Test.cs
================================================
using System.Text;

namespace AsmGen
{
    // Store queue capacity test using 128-bit vector stores.
    public class Stq128Test : UarchTest
    {
        // When set, each measurement interval starts with a branch dependent on a load.
        private bool initialDependentBranch;

        public Stq128Test(int low, int high, int step, bool initialDependentBranch)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "stq128" + (initialDependentBranch ? "db" : string.Empty);
            // NOTE(review): sibling tests (RobTest, TakenBranchBufferTest) describe this
            // flag as "preceded by dependent branch"; "independent" here may be a
            // copy/paste slip - confirm intended wording before changing the string.
            this.Description = "Store Queue with 128-bit stores" + (initialDependentBranch ? ", preceded by independent branch" : string.Empty);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.initialDependentBranch = initialDependentBranch;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (this.initialDependentBranch)
            {
                // Dependent-branch variant is only implemented for aarch64 and riscv.
                if (isa == IUarchTest.ISA.aarch64) return true;
                if (isa == IUarchTest.ISA.riscv) return true;
                return false;
            }
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // Load xmm1 once up front; the unrolled stores just replay it.
                string initInstrs = " movups (%rdx), %xmm1";
                string[] unrolledStores = new string[4];
                unrolledStores[0] = " movaps %xmm1, (%r8)";
                unrolledStores[1] = " movaps %xmm1, (%r8)";
                unrolledStores[2] = " movaps %xmm1, (%r8)";
                unrolledStores[3] = " movaps %xmm1, (%r8)";
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, initInstrs: initInstrs, includePtrChasingLoads: false);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                string initInstrs = " ldr q0, [x1]";
                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
                string[] unrolledStores = new string[4];
                unrolledStores[0] = " str q0, [x2]";
                unrolledStores[1] = " str q0, [x2]";
                unrolledStores[2] = " str q0, [x2]";
                unrolledStores[3] = " str q0, [x2]";
                // Bug fix: initInstrs was built but never passed to the generator,
                // so q0 was stored without ever being initialized. Pass it, matching
                // the amd64 and riscv branches.
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false,
                    initInstrs: initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                // Set vl for 16 e32 elements and load v0 once; the unrolled stores
                // replay it. NOTE(review): vlw.v/vsw.v are pre-ratification RVV
                // (0.7-era) mnemonics - confirm the target toolchain.
                string initInstrs = " mv t6, x0\n addi t6, t6, 16\n vsetvli t5, t6, e32\n vlw.v v0, (a1)";
                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;
                postLoadInstrs += "\n mv t6, a2";
                string[] unrolledStores = new string[1];
                unrolledStores[0] = " vsw.v v0, (t6)\n addi t6, t6, 64";
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, false,
                    initInstrs: initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
            }
        }
    }
}
================================================
FILE: AsmGen/tests/Stq512Test.cs
================================================
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    // Store queue test with 512-bit (AVX-512) stores, optionally spreading
    // the stores across multiple cache lines.
    public class Stq512Test : UarchTest
    {
        // When set, the unrolled stores walk 64B-spaced offsets instead of
        // hitting the base address only.
        private bool differentLines;

        public Stq512Test(int low, int high, int step, bool differentLines)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "stq512" + (differentLines ? "dl" : string.Empty);
            this.Description = "Store Queue with 512-bit stores - AVX-512 only";
            if (differentLines) this.Description += " with multiple lines";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.differentLines = differentLines;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            return false;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // Initialize zmm0/zmm1 once; the unrolled stores replay them.
                string initInstrs = " vmovaps (%r8), %zmm0\n vmovaps %zmm0, %zmm1";
                string[] unrolledStores;
                if (differentLines)
                {
                    // Bug fix: this was declared as a non-generic 'List', which does
                    // not compile; it must be List<string> (generic argument was
                    // likely lost in an earlier edit).
                    List<string> unrolledStoresList = new List<string>();
                    int maxOffset = 512, currentOffset = 0;
                    for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++)
                    {
                        // Walk offsets in 64B (cache line) steps, wrapping after maxOffset.
                        string storeOffset = currentOffset > 0 ? currentOffset.ToString() : string.Empty;
                        string nextInstr = $" vmovaps %zmm0, {storeOffset}(%r8)";
                        unrolledStoresList.Add(nextInstr);
                        if (currentOffset >= maxOffset)
                        {
                            currentOffset = 0;
                        }
                        else currentOffset += 64;
                        // NOTE(review): this second Add makes each loop iteration emit an
                        // extra store to the base line, doubling the instruction count
                        // relative to Counts - confirm the interleaving is intentional.
                        unrolledStoresList.Add(" vmovaps %zmm0, (%r8)");
                    }
                    unrolledStores = unrolledStoresList.ToArray();
                }
                else
                {
                    unrolledStores = new string[2];
                    unrolledStores[0] = " vmovaps %zmm0, (%r8)";
                    unrolledStores[1] = " vmovaps %zmm1, (%r8)";
                }
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false, initInstrs: initInstrs);
            }
        }
    }
}
================================================
FILE: AsmGen/tests/StqTest.cs
================================================
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    // Store queue capacity test with scalar stores. The 'spaced' variant
    // spreads stores across addresses instead of hammering one location.
    public class StqTest : UarchTest
    {
        // When set, each measurement interval starts with a branch dependent on a load.
        private bool initialDependentBranch;
        // When set, stores are spread across addresses (see per-ISA notes below).
        private bool spaced;

        public StqTest(int low, int high, int step, bool initialDependentBranch, bool spaced)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "stq" + (initialDependentBranch ? "db" : string.Empty);
            // NOTE(review): sibling tests (RobTest, TakenBranchBufferTest) describe this
            // flag as "preceded by dependent branch"; "independent" here may be a
            // copy/paste slip - confirm intended wording before changing the string.
            this.Description = "Store Queue" + (initialDependentBranch ? ", preceded by independent branch" : string.Empty);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.initialDependentBranch = initialDependentBranch;
            this.spaced = spaced;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (this.initialDependentBranch)
            {
                // Dependent-branch variant is only implemented for aarch64 and riscv.
                if (isa == IUarchTest.ISA.aarch64) return true;
                if (isa == IUarchTest.ISA.riscv) return true;
                return false;
            }
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.mips64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] unrolledStores;
                string postLoadInstrs = "";
                if (spaced)
                {
                    // r11 walks from the array base so each store hits a new cache line.
                    postLoadInstrs = "mov %r8, %r11";
                    // Bug fix: this was declared as a non-generic 'List', which does
                    // not compile; it must be List<string>.
                    List<string> storeInstrs = new List<string>();
                    for (int i = 0; i < this.Counts[Counts.Length - 1]; i++)
                    {
                        // Send to different cache lines
                        storeInstrs.Add(" mov %r15, (%r11)\n add $64, %r11");
                    }
                    unrolledStores = storeInstrs.ToArray();
                }
                else
                {
                    unrolledStores = new string[4];
                    unrolledStores[0] = " mov %r15, (%r8)";
                    unrolledStores[1] = " mov %r14, (%r8)";
                    unrolledStores[2] = " mov %r13, (%r8)";
                    unrolledStores[3] = " mov %r12, (%r8)";
                }
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                    sb,
                    this.Counts,
                    this.Prefix,
                    unrolledStores,
                    unrolledStores,
                    postLoadInstrs1: postLoadInstrs,
                    postLoadInstrs2: postLoadInstrs,
                    includePtrChasingLoads: false);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
                string[] unrolledStores = new string[4];
                unrolledStores[0] = " str x15, [x2]";
                unrolledStores[1] = " str x14, [x2]";
                unrolledStores[2] = " str x13, [x2]";
                unrolledStores[3] = " str x12, [x2]";
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                string[] unrolledStores = new string[4];
                unrolledStores[0] = " st.d $r15, $r6, 0";
                unrolledStores[1] = " st.d $r16, $r6, 0";
                unrolledStores[2] = " st.d $r17, $r6, 0";
                unrolledStores[3] = " st.d $r18, $r6, 0";
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;
                string[] unrolledStores;
                if (this.spaced)
                {
                    // Bug fix: this was declared as a non-generic 'List', which does
                    // not compile; it must be List<string>.
                    List<string> stores = new List<string>();
                    // NOTE(review): a fixed 32 stores spaced 16B apart (two per 64B
                    // line), unlike the x86 spaced variant which emits Counts-max
                    // stores 64B apart - confirm the asymmetry is intentional.
                    for (int i = 0; i < 32; i++)
                    {
                        stores.Add($" sd x28, {i * 16}(x12)");
                    }
                    unrolledStores = stores.ToArray();
                }
                else
                {
                    unrolledStores = new string[4];
                    unrolledStores[0] = " sd x28, (x12)";
                    unrolledStores[1] = " sd x29, 8(x12)";
                    unrolledStores[2] = " sd x30, 16(x12)";
                    unrolledStores[3] = " sd x31, 24(x12)";
                }
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, false,
                    postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
            }
        }
    }
}
================================================
FILE: AsmGen/tests/TakenBranchBufferTest.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// Measures effective taken-branch tracking capacity: how many taken branches
/// the core can keep in flight behind a pending (cache-missing) load before
/// stalling. Generates one function per tested count; each fills the window
/// with unconditional taken branches after a pointer-chasing load.
/// </summary>
public class TakenBranchBufferTest : UarchTest
{
    // When true, a branch whose direction depends on the pending load is
    // inserted right after each load so branches cannot resolve/retire until
    // the load completes. Only generated for aarch64 (see SupportsIsa).
    private bool initialDependentBranch;

    /// <summary>
    /// Builds the test over a range of taken-branch counts.
    /// </summary>
    /// <param name="low">Smallest branch count tested.</param>
    /// <param name="high">Largest branch count tested.</param>
    /// <param name="step">Increment between counts.</param>
    /// <param name="initialDependentBranch">Emit a load-dependent branch before the filler branches (aarch64 only).</param>
    public TakenBranchBufferTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "tbb" + (initialDependentBranch ? "db" : string.Empty);
        this.Description = "Taken Branch Buffer Test (taken branches pending retire)" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    /// <summary>
    /// amd64 and aarch64 only; the dependent-branch variant is aarch64 only.
    /// </summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        // if (isa == IUarchTest.ISA.mips64) return true;
        // if (isa == IUarchTest.ISA.riscv) return true;
        return false;
    }

    /// <summary>Dispatches to the per-ISA assembly generator.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);
        else if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb);
    }

    /// <summary>
    /// Emits GNU-syntax x86-64 functions. Each loop iteration does a
    /// pointer-chasing load into edi/esi, then Counts[i] unconditional taken
    /// jumps behind each load. Jumps are .align'd apart so consecutive
    /// branches don't share BTB/fetch slots.
    /// </summary>
    public void GenerateX86GccAsm(StringBuilder sb)
    {
        for (int i = 0; i < Counts.Length; i++)
        {
            string funcName = Prefix + Counts[i];
            sb.AppendLine("\n" + funcName + ":");
            // Save everything we clobber.
            sb.AppendLine(" push %rsi");
            sb.AppendLine(" push %rdi");
            sb.AppendLine(" push %r15");
            sb.AppendLine(" push %r14");
            sb.AppendLine(" push %r13");
            sb.AppendLine(" push %r12");
            sb.AppendLine(" push %r11");
            sb.AppendLine(" push %r8");
            sb.AppendLine(" push %rcx");
            sb.AppendLine(" push %rdx");
            // arguments are in RDI, RSI, RDX, RCX, R8, and R9
            // move them into familiar windows argument regs (rcx, rdx, r8)
            sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
            sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
            sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
            sb.AppendLine(" xor %r15, %r15");
            sb.AppendLine(" mov $0x1, %r14");
            sb.AppendLine(" mov $0x2, %r13");
            sb.AppendLine(" mov $0x3, %r12");
            sb.AppendLine(" mov $0x4, %r11");
            // Two independent pointer-chasing chains start at arr[0] and arr[0x40].
            sb.AppendLine(" xor %rdi, %rdi");
            sb.AppendLine(" mov $0x40, %esi");
            sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
            sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
            sb.AppendLine("\n" + funcName + "start:");
            sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
            for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
            {
                string jumpLabel = $"{funcName}_edi_target{fillerIdx}";
                sb.AppendLine($" jmp {jumpLabel}");
                sb.AppendLine(".align 16");
                // Alternate padding so targets don't all land at identical offsets.
                if (fillerIdx % 2 == 0) sb.AppendLine(" nop");
                sb.AppendLine($"{jumpLabel}:");
            }
            sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
            for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
            {
                string jumpLabel = $"{funcName}_esi_target{fillerIdx}";
                sb.AppendLine($" jmp {jumpLabel}");
                // try to space the jumps out a bit
                sb.AppendLine(".align 16");
                if (fillerIdx % 2 == 0) sb.AppendLine(" nop");
                sb.AppendLine($"{jumpLabel}:");
            }
            sb.AppendLine(" dec %rcx");
            sb.AppendLine(" jne " + funcName + "start");
            // Restore in reverse push order.
            sb.AppendLine(" pop %rdx");
            sb.AppendLine(" pop %rcx");
            sb.AppendLine(" pop %r8");
            sb.AppendLine(" pop %r11");
            sb.AppendLine(" pop %r12");
            sb.AppendLine(" pop %r13");
            sb.AppendLine(" pop %r14");
            sb.AppendLine(" pop %r15");
            sb.AppendLine(" pop %rdi");
            sb.AppendLine(" pop %rsi");
            sb.AppendLine(" ret\n\n");
        }
    }

    /// <summary>
    /// Emits aarch64 functions. x0 = iteration count, x1 = pointer chasing
    /// array. w25/w26 walk two independent chains; Counts[i] taken branches
    /// follow each chained load (optionally preceded by a load-dependent
    /// branch for the "db" variant).
    /// </summary>
    public void GenerateArmAsm(StringBuilder sb)
    {
        string dependentBranch = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
        for (int i = 0; i < Counts.Length; i++)
        {
            string funcName = Prefix + Counts[i];
            sb.AppendLine("\n" + funcName + ":");
            // Save callee-saved / clobbered registers on the stack.
            sb.AppendLine(" sub sp, sp, #0x50");
            sb.AppendLine(" stp x14, x15, [sp, #0x10]");
            sb.AppendLine(" stp x12, x13, [sp, #0x20]");
            sb.AppendLine(" stp x10, x11, [sp, #0x30]");
            sb.AppendLine(" stp x25, x26, [sp, #0x40]");
            sb.AppendLine(" mov x15, 1");
            sb.AppendLine(" mov x14, 2");
            sb.AppendLine(" mov x13, 3");
            sb.AppendLine(" mov x12, 4");
            sb.AppendLine(" mov x11, 5");
            sb.AppendLine(" mov x10, 6");
            // Chain start indices, matching the x86 version (0 and 0x40).
            sb.AppendLine(" mov w25, 0x0");
            sb.AppendLine(" mov w26, 0x40");
            sb.AppendLine("\n" + funcName + "start:");
            sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
            if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
            for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
            {
                string jumpLabel = $"{funcName}_w25_target{fillerIdx}";
                sb.AppendLine($" b {jumpLabel}");
                sb.AppendLine($"{jumpLabel}:");
            }
            sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
            if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
            for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++)
            {
                string jumpLabel = $"{funcName}_w26_target{fillerIdx}";
                sb.AppendLine($" b {jumpLabel}");
                sb.AppendLine($"{jumpLabel}:");
            }
            sb.AppendLine(" sub x0, x0, 1");
            sb.AppendLine(" cbnz x0, " + funcName + "start");
            sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
            sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
            sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
            sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
            sb.AppendLine(" add sp, sp, #0x50");
            sb.AppendLine(" ret\n\n");
        }
        // One shared target for all dependent branches generated above.
        if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
    }
}
}
================================================
FILE: AsmGen/tests/TakenJumpSchedTest.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// Measures scheduler capacity for taken conditional jumps: each generated
/// function puts Counts[i] compare+branch pairs (whose condition depends on a
/// pending load) between two pointer-chasing loads and times the loop.
/// </summary>
public class TakenJumpSchedTest : UarchTest
{
    /// <summary>
    /// Builds the test over a range of in-flight jump counts.
    /// </summary>
    /// <param name="low">Smallest count tested.</param>
    /// <param name="high">Largest count tested.</param>
    /// <param name="step">Increment between counts.</param>
    public TakenJumpSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "takenjumpsched";
        this.Description = "Scheduler, Taken Jumps";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    /// <summary>amd64 and aarch64 only (riscv path below is unfinished).</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        // if (isa == IUarchTest.ISA.mips64) return true;
        // if (isa == IUarchTest.ISA.riscv) return true;
        return false;
    }

    /// <summary>Emits the per-ISA test functions into <paramref name="sb"/>.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            for (int i = 0; i < this.Counts.Length; i++)
            {
                string funcName = this.Prefix + this.Counts[i];
                sb.AppendLine("\n" + funcName + ":");
                sb.AppendLine(" push %rsi");
                sb.AppendLine(" push %rdi");
                sb.AppendLine(" push %r8");
                sb.AppendLine(" push %rcx");
                sb.AppendLine(" push %rdx");
                // arguments are in RDI, RSI, RDX, RCX, R8, and R9
                // move them into familiar windows argument regs (rcx, rdx, r8)
                sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx
                sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
                sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
                // Two pointer-chasing chains start at arr[0] and arr[0x40].
                sb.AppendLine(" xor %rdi, %rdi");
                sb.AppendLine(" mov $0x40, %esi");
                sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
                sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
                sb.AppendLine("\n" + funcName + "start:");
                sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
                for (int fillerIdx = 0; fillerIdx < this.Counts[i]; fillerIdx++)
                {
                    string labelName = funcName + "part" + fillerIdx;
                    // Compare depends on the pending load, so these jumps sit
                    // in the scheduler until the load returns. They are
                    // expected taken (rdi != rsi), skipping the inc.
                    sb.AppendLine(" cmp %rdi, %rsi");
                    sb.AppendLine(" jne " + labelName);
                    sb.AppendLine(" inc %rax");
                    sb.AppendLine(".align 16");
                    sb.AppendLine(labelName + ":");
                }
                sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
                // Serialize so each iteration starts from an empty window.
                sb.AppendLine("lfence");
                sb.AppendLine(" dec %rcx");
                sb.AppendLine(" jne " + funcName + "start");
                sb.AppendLine(" pop %rdx");
                sb.AppendLine(" pop %rcx");
                sb.AppendLine(" pop %r8");
                sb.AppendLine(" pop %rdi");
                sb.AppendLine(" pop %rsi");
                sb.AppendLine(" ret\n\n");
            }
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            for (int i = 0; i < this.Counts.Length; i++)
            {
                string funcName = this.Prefix + this.Counts[i];
                // args in x0, x1
                sb.AppendLine("\n" + funcName + ":");
                // Save clobbered registers.
                sb.AppendLine(" sub sp, sp, #0x50");
                sb.AppendLine(" stp x14, x15, [sp, #0x10]");
                sb.AppendLine(" stp x12, x13, [sp, #0x20]");
                sb.AppendLine(" stp x10, x11, [sp, #0x30]");
                sb.AppendLine(" stp x25, x26, [sp, #0x40]");
                sb.AppendLine(" mov x15, 1");
                sb.AppendLine(" mov w25, 0x0");
                sb.AppendLine(" mov w26, 0x40");
                sb.AppendLine("\n" + funcName + "start:");
                sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
                for (int nopIdx = 0; nopIdx < this.Counts[i]; nopIdx++)
                {
                    string labelName = funcName + "part" + nopIdx;
                    // Branch condition depends on the pending load; nops pad
                    // the not-taken path like .align does on x86.
                    sb.AppendLine(" cmp w25, w26");
                    sb.AppendLine(" b.ne " + labelName);
                    sb.AppendLine(" add x15, x15, 1");
                    sb.AppendLine(" nop\n nop\n nop");
                    sb.AppendLine(labelName + ":");
                }
                sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
                // Barrier pair serializes each iteration (aarch64's lfence analog here).
                sb.AppendLine(" dsb sy");
                sb.AppendLine(" isb sy");
                sb.AppendLine(" sub x0, x0, 1");
                sb.AppendLine(" cbnz x0, " + funcName + "start");
                sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
                sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
                sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
                sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
                sb.AppendLine(" add sp, sp, #0x50");
                sb.AppendLine(" ret\n\n");
            }
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            // todo
            // NOTE(review): unreachable - SupportsIsa rejects riscv. These are
            // multiplies, not jumps; placeholder copied from another test.
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " mul x30, x30, x5";
            unrolledAdds[1] = " mul x29, x29, x5";
            unrolledAdds[2] = " mul x28, x28, x5";
            unrolledAdds[3] = " mul x31, x31, x5";
            string[] unrolledAdds1 = new string[4];
            unrolledAdds1[0] = " mul x30, x30, x6";
            unrolledAdds1[1] = " mul x31, x31, x6";
            unrolledAdds1[2] = " mul x28, x28, x6";
            unrolledAdds1[3] = " mul x29, x29, x6";
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, false);
        }
    }
}
}
================================================
FILE: AsmGen/tests/Vec512RfTest.cs
================================================
using System.Collections.Generic;
using System.Text;
namespace AsmGen
{
/// <summary>
/// Vector (512-bit packed FP) register file capacity test. Fills the window
/// with zmm adds behind pending loads so that vector rename capacity becomes
/// the limiter. Requires AVX-512, so amd64 only.
/// </summary>
public class Vec512RfTest : UarchTest
{
    /// <summary>
    /// Builds the test over a range of in-flight vector op counts.
    /// </summary>
    /// <param name="low">Smallest count tested.</param>
    /// <param name="high">Largest count tested.</param>
    /// <param name="step">Increment between counts.</param>
    public Vec512RfTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "vec512rf";
        this.Description = "Vector (512-bit packed fp) RF Test - AVX-512 only";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    /// <summary>AVX-512 only, so amd64 only.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        return false;
    }

    /// <summary>Emits the amd64 test functions into <paramref name="sb"/>.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // it's ok, the ptr chasing arr should be way bigger than this
            string initInstrs = " vmovups (%r8), %zmm1\n" +
                " vmovups 64(%r8), %zmm2\n" +
                " vmovups 128(%r8), %zmm3\n" +
                " vmovups 192(%r8), %zmm4\n" +
                " vmovups 256(%r8), %zmm5\n";
            // use all zmm regs
            for (int i = 6; i < 32; i++)
            {
                initInstrs += "vmovups %zmm5, %zmm" + i + "\n";
            }

            // Fix: use the generic List<string>; the non-generic List does not
            // exist under System.Collections.Generic and would not compile.
            List<string> instrsList = new List<string>();
            for (int i = 1; i < 32; i++)
            {
                // Each add writes a distinct zmm register, so every in-flight
                // instruction needs its own physical vector register.
                instrsList.Add($" vaddps %zmm1, %zmm{i}, %zmm{i}");
            }
            string[] unrolledAdds = instrsList.ToArray();
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs);
        }
    }
}
}
================================================
FILE: AsmGen/tests/VecMulNsq.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// Non-scheduler-queue (NSQ) probe built around vector multiplies: mixes a
/// load-dependent instruction chain with independent filler ops to separate
/// scheduler capacity from any queue in front of it.
/// </summary>
public class VecMulNsq : UarchTest
{
    // Total op count (dependent + independent) handed to the NSQ generator.
    private int totalOps;

    /// <summary>
    /// Builds the test over a range of structure sizes.
    /// </summary>
    /// <param name="low">Smallest count tested.</param>
    /// <param name="high">Largest count tested.</param>
    /// <param name="step">Increment between counts.</param>
    /// <param name="totalOps">Total number of ops per measurement block.</param>
    public VecMulNsq(int low, int high, int step, int totalOps)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "vecmulnsq" + totalOps;
        this.Description = "Vector Integer Multiply, excluding possible NSQ";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.totalOps = totalOps;
    }

    /// <summary>amd64 and aarch64.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        if (isa == IUarchTest.ISA.amd64) return true;
        return false;
    }

    /// <summary>Emits the per-ISA test functions into <paramref name="sb"/>.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // xmm1 is loaded behind the pointer chase, so the pmulld chain
            // below depends on the pending load; xmm2 is loop-invariant so
            // the indep instructions can issue immediately.
            string postLoadInstrs = " mov %rdi, %r15\n add %r8, %r15\n movdqu (%r15), %xmm1";
            string initInstrs = " movdqu (%r8), %xmm2";
            string[] depInstrs = new string[4];
            depInstrs[0] = " pmulld %xmm1, %xmm0";
            depInstrs[1] = " pmulld %xmm1, %xmm3";
            depInstrs[2] = " pmulld %xmm1, %xmm4";
            depInstrs[3] = " pmulld %xmm1, %xmm5";
            string[] indepInstrs = new string[2];
            indepInstrs[0] = " pmulld %xmm2, %xmm6";
            indepInstrs[1] = " pmulld %xmm2, %xmm7";
            UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs, postLoadInstrs);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            // NOTE(review): these are scalar fadd instructions, not vector
            // integer multiplies, so the aarch64 variant does not match the
            // test's description or the amd64 pmulld path. Looks copy-pasted
            // from an FP-add NSQ test - confirm intent before relying on
            // aarch64 results from this test.
            string postLoadInstrs1 = " ldr s16, [x2, w25, uxtw #2]";
            string initInstrs = " ldr s15, [x2]";
            string[] depInstrs = new string[4];
            depInstrs[0] = " fadd s0, s0, s16";
            depInstrs[1] = " fadd s1, s1, s16";
            depInstrs[2] = " fadd s2, s2, s16";
            depInstrs[3] = " fadd s3, s3, s16";
            string[] indepInstrs = new string[4];
            indepInstrs[0] = " fadd s17, s17, s15";
            indepInstrs[1] = " fadd s18, s18, s15";
            indepInstrs[2] = " fadd s19, s19, s15";
            indepInstrs[3] = " fadd s20, s20, s15";
            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs,
                postLoadInstrs: postLoadInstrs1);
        }
    }
}
}
================================================
FILE: AsmGen/tests/ZeroRobTest.cs
================================================
using System.Text;
namespace AsmGen
{
/// <summary>
/// Reorder buffer capacity test using zeroing idioms (xor reg,reg / mov #0)
/// as filler, to see whether zeroed registers are eliminated at rename and
/// thus tracked only by the ROB rather than consuming physical registers.
/// </summary>
public class ZeroRobTest : UarchTest
{
    // When true, a load-dependent branch precedes the filler instructions
    // (aarch64 only, see SupportsIsa).
    private bool initialDependentBranch;

    /// <summary>
    /// Builds the test over a range of structure sizes.
    /// </summary>
    /// <param name="low">Smallest count tested.</param>
    /// <param name="high">Largest count tested.</param>
    /// <param name="step">Increment between counts.</param>
    /// <param name="initialDependentBranch">Emit a load-dependent branch before the fillers (aarch64 only).</param>
    public ZeroRobTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "zerorob" + (initialDependentBranch ? "db" : string.Empty);
        this.Description = "Reorder Buffer Test with Zeroing Idioms" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    /// <summary>amd64, aarch64, mips64; dependent-branch variant aarch64 only.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        if (isa == IUarchTest.ISA.mips64) return true;
        return false;
    }

    /// <summary>Emits the per-ISA test functions into <paramref name="sb"/>.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // xor reg,reg is the classic x86 zeroing idiom.
            string[] nops = new string[] { " xor %r11, %r11" };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, true);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
            string[] nops = new string[] { " mov x10, 0" };
            // Bug fix: postLoadInstrs was computed but never passed, so the
            // "db" variant emitted a dependent-branch target below without any
            // branch referencing it. Pass it through like the other *db tests
            // (see TakenBranchBufferTest / the store buffer test).
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops,
                includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            string[] nops = new string[] { " move $r14, $r0" };
            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, includePtrChasingLoads: true);
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            // Bug fix: "mov $r14, $r0" is not RISC-V syntax ($-prefixed regs,
            // and no mov mnemonic). Use the mv pseudo-instruction with x-regs,
            // matching the register names used by the other riscv tests.
            // (Currently unreachable: SupportsIsa does not report riscv.)
            string[] nops = new string[] { " mv x14, x0" };
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, includePtrChasingLoads: true);
        }
    }
}
}
================================================
FILE: CoherencyLatency/CoherencyLatency.cpp
================================================
#include
#include
#ifndef __MINGW32__
#include
#else
#include
#endif
#include
#include
// Default iteration count (overridable with -iterations).
// Fix: removed the trailing semicolon - with it, using the macro inside any
// expression (e.g. 2 * ITERATIONS) would not compile.
#define ITERATIONS 10000000

float RunTest(unsigned int processor1, unsigned int processor2, uint64_t iter);
float RunOwnedTest(unsigned int processor1, unsigned int processor2, uint64_t iter);
DWORD WINAPI LatencyTestThread(LPVOID param);
DWORD WINAPI ReadLatencyTestThread(LPVOID param);

// Page-aligned memory bounced between threads. bouncyBase is the allocation
// base; bouncy points at the cache line currently under test.
LONG64* bouncyBase;
LONG64* bouncy;

// Per-thread parameters for one side of a latency ping-pong.
typedef struct LatencyThreadData {
    uint64_t start;     // initial value to write into target
    uint64_t iterations; // number of iterations to run
    LONG64 *target;     // value to bounce between threads, init with start - 1
    LONG64 *readTarget; // for read test, memory location to read from (owned by other core)
    DWORD affinityMask; // thread affinity mask to set
} LatencyData;
// Entry point: parses options, then measures core-to-core latency for every
// (i, j) processor pair at each requested cache line offset and prints a
// CSV-ish matrix per offset.
int main(int argc, char *argv[]) {
    SYSTEM_INFO sysInfo;
    DWORD numProcs;
    float** latencies;
    uint64_t iter = ITERATIONS;
    int offsets = 1;
    float (*test)(unsigned int, unsigned int, uint64_t) = RunTest;

    // Parse -iterations <n>, -bounce, -owned, -offset <n>
    for (int argIdx = 1; argIdx < argc; argIdx++) {
        if (*(argv[argIdx]) == '-') {
            char* arg = argv[argIdx] + 1;
            if (_strnicmp(arg, "iterations", 10) == 0) {
                argIdx++;
                // Fix: strtoull instead of atoi - atoi truncates counts above
                // INT_MAX. %llu instead of %lu - long is 32-bit on Windows,
                // so %lu mis-prints a uint64_t.
                iter = strtoull(argv[argIdx], NULL, 10);
                fprintf(stderr, "%llu iterations requested\n", iter);
            }
            else if (_strnicmp(arg, "bounce", 6) == 0) {
                fprintf(stderr, "Bouncy\n");
            }
            else if (_strnicmp(arg, "owned", 5) == 0) {
                test = RunOwnedTest;
                fprintf(stderr, "Using separate cache lines for each thread to write to\n");
            }
            else if (_strnicmp(arg, "offset", 6) == 0) {
                argIdx++;
                offsets = atoi(argv[argIdx]);
                fprintf(stderr, "Offsets: %d\n", offsets);
            }
        }
    }

    // One cache line (64 B) per requested offset, page aligned.
    bouncyBase = (LONG64*)_aligned_malloc(64 * offsets, 4096);
    bouncy = bouncyBase;
    if (bouncy == NULL) {
        // Fix: bail out instead of continuing - the test threads would
        // dereference this NULL pointer.
        fprintf(stderr, "Could not allocate aligned mem\n");
        return 1;
    }

    GetSystemInfo(&sysInfo);
    numProcs = sysInfo.dwNumberOfProcessors;
    fprintf(stderr, "Number of CPUs: %u\n", numProcs);

    latencies = (float **)malloc(sizeof(float*) * offsets);
    if (latencies == NULL) {
        fprintf(stderr, "couldn't allocate result array\n");
        return 0;
    }

    // Loop index is int (matching `offsets`) to avoid signed/unsigned mixups.
    for (int offsetIdx = 0; offsetIdx < offsets; offsetIdx++) {
        bouncy = (LONG64*)((char*)bouncyBase + offsetIdx * 64);
        latencies[offsetIdx] = (float*)malloc(sizeof(float) * numProcs * numProcs);
        if (latencies[offsetIdx] == NULL) {
            // Fix: previously unchecked; the fill loop below would crash.
            fprintf(stderr, "couldn't allocate result array\n");
            return 0;
        }
        float* latenciesPtr = latencies[offsetIdx];
        // Run all to all, skipping testing a core against itself ofc
        // technically can skip the other way around (start j = i + 1) but meh
        for (DWORD i = 0; i < numProcs; i++) {
            for (DWORD j = 0; j < numProcs; j++) {
                latenciesPtr[j + i * numProcs] = i == j ? 0 : test(i, j, iter);
            }
        }
    }

    for (int offsetIdx = 0; offsetIdx < offsets; offsetIdx++) {
        printf("Cache line offset: %d\n", offsetIdx);
        float* latenciesPtr = latencies[offsetIdx];
        // print thing to copy to excel
        for (DWORD i = 0; i < numProcs; i++) {
            for (DWORD j = 0; j < numProcs; j++) {
                if (j != 0) printf(",");
                if (j == i) printf("x");
                else printf("%f", latenciesPtr[j + i * numProcs]);
            }
            printf("\n");
        }
        free(latenciesPtr);
    }
    free(latencies);
    _aligned_free(bouncyBase);
    return 0;
}
// Runs one timed pair of test threads pinned to processor1/processor2 and
// returns per-hop latency in ns (total time / iterations / 2), or -1 if
// thread creation failed.
float TimeThreads(unsigned int processor1, unsigned int processor2, uint64_t iter, LatencyData lat1, LatencyData lat2, DWORD (*threadFunc)(LPVOID)) {
    struct timeb start, end;
    HANDLE testThreads[2];
    DWORD tid1, tid2;
    // Create suspended so affinity can be set before either thread runs.
    testThreads[0] = CreateThread(NULL, 0, threadFunc, &lat1, CREATE_SUSPENDED, &tid1);
    testThreads[1] = CreateThread(NULL, 0, threadFunc, &lat2, CREATE_SUSPENDED, &tid2);
    if (testThreads[0] == NULL || testThreads[1] == NULL) {
        // NOTE(review): if exactly one CreateThread succeeded, that handle is
        // leaked (and its thread left suspended) - confirm acceptable.
        fprintf(stderr, "Failed to create test threads\n");
        return -1;
    }
    SetThreadAffinityMask(testThreads[0], 1ULL << (uint64_t)processor1);
    SetThreadAffinityMask(testThreads[1], 1ULL << (uint64_t)processor2);
    // Timing brackets resume -> both threads done; startup cost is amortized
    // over the (large) iteration count.
    ftime(&start);
    ResumeThread(testThreads[0]);
    ResumeThread(testThreads[1]);
    WaitForMultipleObjects(2, testThreads, TRUE, INFINITE);
    ftime(&end);
    int64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm);
    float latency = 1e6 * (float)time_diff_ms / (float)iter;
    fprintf(stderr, "%d to %d: %f ns\n", processor1, processor2, latency);
    CloseHandle(testThreads[0]);
    CloseHandle(testThreads[1]);
    // each thread does interlocked compare and exchange iterations times. divide by 2 to get overall count of locked ops
    return latency / 2;
}
///
/// Measures latency from one processor core to another
///
/// processor number 1
/// processor number 2
/// Number of iterations
/// aligned mem to bounce around
/// latency per iteration in ns
float RunTest(unsigned int processor1, unsigned int processor2, uint64_t iter) {
LatencyData lat1, lat2;
float latency;
*bouncy = 0;
lat1.iterations = iter;
lat1.start = 1;
lat1.target = bouncy;
lat2.iterations = iter;
lat2.start = 2;
lat2.target = bouncy;
latency = TimeThreads(processor1, processor2, iter, lat1, lat2, LatencyTestThread);
return latency;
}
// Variant of RunTest where each thread writes to its own cache line and reads
// the line owned by the other core, avoiding ownership ping-pong of a single
// line. Returns latency per hop in ns, or -1 on allocation failure.
float RunOwnedTest(unsigned int processor1, unsigned int processor2, uint64_t iter) {
    LatencyData lat1, lat2;
    LONG64* target1, * target2;
    float latency;
    // drop them on different cache lines
    target1 = (LONG64*)_aligned_malloc(128, 64);
    if (target1 == NULL) {
        // Fix: previously fell through and dereferenced the NULL pointer
        // below. Return -1, matching TimeThreads' failure convention.
        fprintf(stderr, "Could not allocate aligned mem\n");
        return -1;
    }
    target2 = target1 + 8; // 8 * sizeof(LONG64) = 64 bytes -> next cache line
    *target1 = 1;
    *target2 = 0;
    lat1.iterations = iter;
    lat1.start = 3;
    lat1.target = target1;
    lat1.readTarget = target2;
    lat2.iterations = iter;
    lat2.start = 2;
    lat2.target = target2;
    lat2.readTarget = target1;
    latency = TimeThreads(processor1, processor2, iter, lat1, lat2, ReadLatencyTestThread);
    _aligned_free(target1);
    return latency;
}
///
/// Runs one thread of the latency test. should be run in pairs
/// Always writes to target
///
/// Latency test params
/// next value that would have been written to shared memory
// One side of the locked ping-pong. Each thread repeatedly tries to swap the
// shared value from (next - 1) to next; one thread publishes odd values, the
// other even, so every successful CAS implies a round trip from the peer.
// Returns the next value that would have been published.
DWORD WINAPI LatencyTestThread(LPVOID param) {
    LatencyData *data = (LatencyData *)param;
    uint64_t next = data->start;
    const uint64_t limit = 2 * data->iterations;

    while (next <= limit) {
        // Succeeds only once the peer has published next - 1.
        if (_InterlockedCompareExchange64(data->target, next, next - 1) == next - 1) {
            next += 2;
        }
    }
    return next;
}
///
/// Similar thing but tries to not bounce cache line ownership
/// Instead, threads write to different cache lines
///
/// Latency test params
/// next value that would have been written to owned mem
// One side of the "owned" ping-pong: spin-read the peer's cache line, and
// when the expected value shows up, publish the next value to our own line.
// Returns the next value that would have been written.
// Fix: removed an unused `startTsc = __rdtsc()` local left over from earlier
// timing code.
DWORD WINAPI ReadLatencyTestThread(LPVOID param) {
    LatencyData* latencyData = (LatencyData*)param;
    uint64_t current = latencyData->start;
    while (current <= 2 * latencyData->iterations) {
        if (*(latencyData->readTarget) == current - 1) {
            *(latencyData->target) = current;
            current += 2;
            // Push the store out promptly so the peer observes it.
            _mm_sfence();
        }
    }
    return current;
}
================================================
FILE: CoherencyLatency/CoherencyLatency.sln
================================================
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.31025.194
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CoherencyLatency", "CoherencyLatency.vcxproj", "{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|x64 = Release|x64
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x64.ActiveCfg = Debug|x64
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x64.Build.0 = Debug|x64
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x86.ActiveCfg = Debug|Win32
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x86.Build.0 = Debug|Win32
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x64.ActiveCfg = Release|x64
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x64.Build.0 = Release|x64
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x86.ActiveCfg = Release|Win32
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x86.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {A6E60C3D-60ED-4DBF-B4AA-7C1C3A140325}
EndGlobalSection
EndGlobal
================================================
FILE: CoherencyLatency/CoherencyLatency.vcxproj
================================================
Debug
Win32
Release
Win32
Debug
x64
Release
x64
16.0
Win32Proj
{6d9ccc8c-09f5-484b-8630-be18a9cf1995}
CoherencyLatency
10.0
Application
true
v142
Unicode
Application
false
v142
true
Unicode
Application
true
v142
Unicode
Application
false
v142
true
Unicode
true
false
true
false
Level3
true
WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
Level3
true
_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
================================================
FILE: CoherencyLatency/Makefile
================================================
# arch_detect.mk presumably sets $(CC), $(LDFLAGS), and $(TARGET) for the host
# architecture - confirm in ../Common/arch_detect.mk.
include ../Common/arch_detect.mk
CFLAGS = -pthread -O3

# Default: build whichever arch target detection selected.
all: $(TARGET)

# Linux builds use the pthreads implementation.
amd64:
	$(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_amd64 $(LDFLAGS)
aarch64:
	$(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_aarch64 $(LDFLAGS)
riscv64:
	$(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_riscv64 $(LDFLAGS)

# Windows cross-build uses the Win32 implementation.
w64:
	$(CC) $(CFLAGS) CoherencyLatency.cpp -o CoherencyLatency_w64.exe $(LDFLAGS)

# w64 can build with mingw 11, which isn't available on jammy
ci: amd64 aarch64 riscv64

clean:
	rm -rf *.o *.zip "ocl-icd-libopencl1*" "OpenCL-SDK*" && find . -type f -executable -delete

.PHONY: all ci clean
================================================
FILE: CoherencyLatency/PThreadsCoherencyLatency.c
================================================
#define _GNU_SOURCE
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
// Default iteration count (overridable with -iterations).
// Fix: removed the trailing semicolon - with it, using the macro inside any
// expression (e.g. 2 * ITERATIONS) would not compile.
#define ITERATIONS 10000000

// glibc only gained a gettid() wrapper recently; call the syscall directly.
#define gettid() syscall(SYS_gettid)

// Per-thread parameters for one side of a latency ping-pong.
typedef struct LatencyThreadData {
    uint64_t start;               // first value this thread publishes
    uint64_t iterations;          // number of round trips to perform
    volatile uint64_t *target;    // shared location bounced between the two threads
    unsigned int processorIndex;  // logical CPU this thread pins itself to
} LatencyData;

// Parameters and result slot for one core-pair measurement.
typedef struct LatencyPairRunData {
    uint32_t processor1;
    uint32_t processor2;
    uint64_t iter;
    float result;      // latency in ns, filled in by RunTest
    uint64_t *target;  // cache line assigned to this pair
} LatencyPairRunData;

void *LatencyTestThread(void *param);
void *NoLockLatencyTestThread(void *param);

// Thread body selected on the command line: default is the locked CAS
// ping-pong; -nolock switches to plain loads/stores.
void *(*testFunc)(void *) = LatencyTestThread;

void *RunTest(void *param);
// Entry point: parses options, then measures core-to-core latency for every
// (i, j) processor pair at each requested cache line offset, optionally
// running several non-overlapping pairs in parallel. Prints one matrix per
// offset.
int main(int argc, char *argv[]) {
    float **latencies;
    int *parallelTestState;
    int numProcs, offsets = 1, parallelismFactor = 1;
    uint64_t iter = ITERATIONS;
    uint64_t *bouncyArr;

    numProcs = get_nprocs();
    fprintf(stderr, "Number of CPUs: %d\n", numProcs); // fix: %d, numProcs is int

    // Parse -iterations <n>, -nolock, -offset <n>, -parallel <n>
    for (int argIdx = 1; argIdx < argc; argIdx++) {
        if (*(argv[argIdx]) == '-') {
            char* arg = argv[argIdx] + 1;
            if (strncmp(arg, "iterations", 10) == 0) {
                argIdx++;
                // Fix: strtoull instead of atoi - atoi truncates counts above INT_MAX.
                iter = strtoull(argv[argIdx], NULL, 10);
                fprintf(stderr, "%lu iterations requested\n", iter);
            }
            else if (strncmp(arg, "nolock", 6) == 0) {
                fprintf(stderr, "No locks, plain loads and stores\n");
                testFunc = NoLockLatencyTestThread;
            }
            else if (strncmp(arg, "offset", 6) == 0) {
                argIdx++;
                offsets = atoi(argv[argIdx]);
                fprintf(stderr, "Offsets: %d\n", offsets);
            }
            else if (strncmp(arg, "parallel", 8) == 0) {
                argIdx++;
                parallelismFactor = atoi(argv[argIdx]);
                fprintf(stderr, "Will go for %d runs in parallel\n", parallelismFactor);
            }
        }
    }

    latencies = (float **)malloc(sizeof(float *) * offsets);
    // parallelTestState[j + i * numProcs]: 0 = not run, 1 = in flight, 2 = done
    parallelTestState = (int *)malloc(sizeof(int) * numProcs * numProcs);
    // Fix: zero sizeof(float *) per entry, not sizeof(float) - the array
    // holds pointers, so the old size only cleared half of it on 64-bit.
    memset(latencies, 0, sizeof(float *) * offsets);
    // One page per parallel slot; each pair gets its own cache line region.
    if (0 != posix_memalign((void **)(&bouncyArr), 4096, 4096 * parallelismFactor)) {
        fprintf(stderr, "Could not allocate aligned mem\n");
        return 0;
    }

    LatencyPairRunData *pairRunData = (LatencyPairRunData *)malloc(sizeof(LatencyPairRunData) * parallelismFactor);
    for (int offsetIdx = 0; offsetIdx < offsets; offsetIdx++) {
        latencies[offsetIdx] = (float *)malloc(sizeof(float) * numProcs * numProcs);
        memset(parallelTestState, 0, sizeof(int) * numProcs * numProcs);
        float *latenciesPtr = latencies[offsetIdx];
        while (1) {
            // select parallelismFactor threads
            int selectedParallelTestCount = 0;
            memset(pairRunData, 0, sizeof(LatencyPairRunData) * parallelismFactor);
            for (int i = 0; i < numProcs && selectedParallelTestCount < parallelismFactor; i++) {
                for (int j = 0; j < numProcs && selectedParallelTestCount < parallelismFactor; j++) {
                    if (j == i) { latenciesPtr[j + i * numProcs] = 0; continue; }
                    if (parallelTestState[j + i * numProcs] == 1) {
                        fprintf(stderr, "Thread unexpectedly did not complete\n");
                        exit(0);
                    }
                    if (parallelTestState[j + i * numProcs] == 0) {
                        // neither thread can already have a pending run
                        int validPair = 1;
                        for (int c = 0; c < numProcs; c++) {
                            if (parallelTestState[j + c * numProcs] == 1 ||
                                parallelTestState[c + i * numProcs] == 1 ||
                                parallelTestState[i + c * numProcs] == 1 ||
                                parallelTestState[c + j * numProcs] == 1) {
                                validPair = 0;
                                break;
                            }
                        }
                        if (!validPair) continue;
                        // for SMT enabled CPUs, check sibling threads. will do later
                        parallelTestState[j + i * numProcs] = 1;
                        pairRunData[selectedParallelTestCount].processor1 = i;
                        pairRunData[selectedParallelTestCount].processor2 = j;
                        pairRunData[selectedParallelTestCount].iter = iter;
                        pairRunData[selectedParallelTestCount].result = 0.0f;
                        // 512 uint64s = one 4 KB page per slot; 8 uint64s = one
                        // 64 B cache line per offset within the slot's page.
                        pairRunData[selectedParallelTestCount].target = bouncyArr + (512 * selectedParallelTestCount + 8 * offsetIdx);
                        fprintf(stderr, "Selected %d -> %d\n", i, j);
                        selectedParallelTestCount++;
                    }
                }
            }
            // No pairs left to run at this offset.
            if (selectedParallelTestCount == 0) break;

            // launch threads
            fprintf(stderr, "Selected %d pairs for parallel testing\n", selectedParallelTestCount);
            pthread_t *testThreads = (pthread_t *)malloc(selectedParallelTestCount * sizeof(pthread_t));
            memset(testThreads, 0, selectedParallelTestCount * sizeof(pthread_t));
            for (int parallelIdx = 0; parallelIdx < selectedParallelTestCount; parallelIdx++) {
                // (0, 0) can never be selected (i == j is skipped), so this
                // only guards against uninitialized tail entries.
                if (pairRunData[parallelIdx].processor1 == 0 && pairRunData[parallelIdx].processor2 == 0) break;
                pthread_create(testThreads + parallelIdx, NULL, RunTest, (void *)(pairRunData + parallelIdx));
            }
            // join threads and record their results
            for (int parallelIdx = 0; parallelIdx < selectedParallelTestCount; parallelIdx++) {
                pthread_join(testThreads[parallelIdx], NULL);
                int i = pairRunData[parallelIdx].processor1;
                int j = pairRunData[parallelIdx].processor2;
                latenciesPtr[j + i * numProcs] = pairRunData[parallelIdx].result;
                parallelTestState[j + i * numProcs] = 2;
            }
            free(testThreads);
        }
    }

    for (int offsetIdx = 0; offsetIdx < offsets; offsetIdx++) {
        float *latenciesPtr = latencies[offsetIdx];
        printf("Cache line offset: %d\n", offsetIdx);
        for (int i = 0; i < numProcs; i++) {
            for (int j = 0; j < numProcs; j++) {
                if (j != 0) printf(",");
                if (j == i) printf("x");
                // to maintain consistency, divide by 2 (see justification in windows version)
                else printf("%f", latenciesPtr[j + i * numProcs] / 2);
            }
            printf("\n");
        }
        free(latenciesPtr);
    }
    free(parallelTestState);
    free(pairRunData);
    free(latencies);
    free(bouncyArr);
    return 0;
}
// run test and gather timing data using the specified thread function
// run test and gather timing data using the specified thread function
// (threads pin themselves via lat1/lat2->processorIndex; proc1/proc2 are kept
// for signature parity with the Windows version). Returns latency per
// iteration in ns, or 0 if thread creation failed.
float TimeThreads(unsigned int proc1,
                  unsigned int proc2,
                  uint64_t iter,
                  LatencyData *lat1,
                  LatencyData *lat2,
                  void *(*threadFunc)(void *)) {
    struct timeval tvBegin, tvEnd;
    struct timezone tzBegin, tzEnd;
    pthread_t workers[2];
    void *retval1, *retval2;

    // Timing deliberately brackets creation through join; startup cost is
    // amortized over the large iteration count.
    gettimeofday(&tvBegin, &tzBegin);
    int rc1 = pthread_create(&workers[0], NULL, threadFunc, (void *)lat1);
    int rc2 = pthread_create(&workers[1], NULL, threadFunc, (void *)lat2);
    if (rc1 != 0 || rc2 != 0) {
        fprintf(stderr, "Could not create threads\n");
        return 0;
    }
    pthread_join(workers[0], &retval1);
    pthread_join(workers[1], &retval2);
    gettimeofday(&tvEnd, &tzEnd);

    uint64_t elapsedMs = 1000 * (tvEnd.tv_sec - tvBegin.tv_sec) + ((tvEnd.tv_usec - tvBegin.tv_usec) / 1000);
    return 1e6 * (float)elapsedMs / (float)iter;
}
// test latency between two logical CPUs
// float RunTest(unsigned int processor1, unsigned int processor2, uint64_t iter) {
void *RunTest(void *param) {
LatencyPairRunData *pairRunData = (LatencyPairRunData *)param;
uint32_t processor1 = pairRunData->processor1;
uint32_t processor2 = pairRunData->processor2;
uint64_t iter = pairRunData->iter;
LatencyData lat1, lat2;
float latency;
*(pairRunData->target) = 0;
lat1.iterations = iter;
lat1.start = 1;
lat1.target = pairRunData->target;
lat1.processorIndex = processor1;
lat2.iterations = iter;
lat2.start = 2;
lat2.target = pairRunData->target;
lat2.processorIndex = processor2;
latency = TimeThreads(processor1, processor2, iter, &lat1, &lat2, NoLockLatencyTestThread);
fprintf(stderr, "%d to %d: %f ns\n", processor1, processor2, latency);
pairRunData->result = latency;
return NULL;
}
// Locked-variant ping-pong thread: pins itself to its assigned CPU, then
// bounces the shared 64-bit location with its partner using an atomic
// compare-and-swap. A thread advances the value by 1 only when it observes
// the partner's last write (current - 1); one round trip = two increments.
void *LatencyTestThread(void *param) {
LatencyData *latencyData = (LatencyData *)param;
cpu_set_t cpuset;
uint64_t current = latencyData->start;
CPU_ZERO(&cpuset);
CPU_SET(latencyData->processorIndex, &cpuset);
sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
//fprintf(stderr, "thread %ld set affinity %d\n", gettid(), latencyData->processorIndex);
// spin until this thread has published its share of 2 * iterations values
while (current <= 2 * latencyData->iterations) {
if (__sync_bool_compare_and_swap(latencyData->target, current - 1, current)) current += 2;
}
pthread_exit(NULL);
}
// Lock-free variant of LatencyTestThread: plain load + store instead of a
// compare-and-swap, so it measures raw cache line transfer latency.
// NOTE(review): correctness relies on the compiler re-reading *target each
// pass (presumably target points to volatile storage — confirm against the
// LatencyData declaration) and on aligned 64-bit accesses being atomic.
void *NoLockLatencyTestThread(void *param) {
LatencyData *latencyData = (LatencyData *)param;
cpu_set_t cpuset;
uint64_t current = latencyData->start;
CPU_ZERO(&cpuset);
CPU_SET(latencyData->processorIndex, &cpuset);
sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
// write our next value only after observing the partner's previous one
while (current <= 2 * latencyData->iterations) {
if (*(latencyData->target) == current - 1) {
*(latencyData->target) = current;
current += 2;
}
}
pthread_exit(NULL);
}
================================================
FILE: CoherencyLatency/c2cparse/Program.cs
================================================
// See https://aka.ms/new-console-template for more information
using System;
public class C2CParse
{
    // Hardware layout of the machine the Linux dump came from:
    // SMT width and physical core count used by the index translation.
    private const int DefaultSmtCount = 4;
    private const int DefaultCoreCount = 64;

    /// <summary>
    /// Reads a square CSV core-to-core latency matrix (Linux core numbering)
    /// from the file named in args[0] and prints it re-ordered into Windows
    /// core numbering.
    /// </summary>
    public static void Main(string[] args)
    {
        if (args.Length == 0)
        {
            Console.WriteLine("Need filename as arg");
            return;
        }

        // RemoveEmptyEntries tolerates a trailing newline, which previously
        // produced a spurious empty row and failed the square-matrix check.
        string[] inputLines = File.ReadAllText(args[0])
            .Split('\n', StringSplitOptions.RemoveEmptyEntries);
        int n = inputLines.Length;
        string[] inputLatencies = new string[n * n];
        string[] outputLatencies = new string[n * n];

        for (int row = 0; row < n; row++)
        {
            // TrimEnd('\r') tolerates CRLF input; validate every row's
            // column count, not just the first one.
            string[] lineSplit = inputLines[row].TrimEnd('\r').Split(',');
            if (lineSplit.Length != n)
            {
                Console.WriteLine("Line count: {0}, line segments: {1} must be equal", n, lineSplit.Length);
                return;
            }
            for (int i = 0; i < n; i++)
            {
                inputLatencies[row * n + i] = lineSplit[i];
            }
        }

        // Translate both the row and column index of every cell.
        for (int row = 0; row < n; row++)
        {
            for (int col = 0; col < n; col++)
            {
                int newRow = GetCoreIndex(row, DefaultSmtCount, DefaultCoreCount);
                int newCol = GetCoreIndex(col, DefaultSmtCount, DefaultCoreCount);
                outputLatencies[newRow * n + newCol] = inputLatencies[row * n + col];
            }
        }

        for (int row = 0; row < n; row++)
        {
            for (int col = 0; col < n; col++)
            {
                Console.Write(",{0}", outputLatencies[row * n + col]);
            }
            Console.WriteLine();
        }
    }

    /// <summary>
    /// Convert a Linux logical core index (physical cores enumerated first,
    /// SMT siblings appended after all physical cores) to a Windows logical
    /// core index (SMT siblings adjacent).
    /// </summary>
    /// <param name="inputIndex">Linux logical core index.</param>
    /// <param name="smtCount">Threads per physical core.</param>
    /// <param name="coreCount">Number of physical cores.</param>
    /// <returns>The equivalent Windows logical core index.</returns>
    public static int GetCoreIndex(int inputIndex, int smtCount, int coreCount)
    {
        int physicalCoreIndex = inputIndex % coreCount;
        int smtIndex = inputIndex / coreCount;
        return physicalCoreIndex * smtCount + smtIndex;
    }
}
================================================
FILE: CoherencyLatency/c2cparse/c2cparse.csproj
================================================
<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>
</Project>
================================================
FILE: CoherencyLatency/c2cparse/c2cparse.sln
================================================
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.4.33110.190
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "c2cparse", "c2cparse.csproj", "{F9E172EC-1A9A-4908-9512-4547CD1CFD80}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F9E172EC-1A9A-4908-9512-4547CD1CFD80}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {4C3856A5-1183-4D5F-80BE-3D694765A594}
EndGlobalSection
EndGlobal
================================================
FILE: Common/arch_detect.mk
================================================
# Detect the build target from the host environment.
# May be overridden on the command line: make TARGET=aarch64
TARGET ?= amd64
ifeq ($(OS),Windows_NT)
TARGET = w64
else
UNAME_M := $(shell uname -m)
ifeq ($(UNAME_M),x86_64)
TARGET = amd64
endif
ifeq ($(UNAME_M),aarch64)
TARGET = aarch64
endif
ifeq ($(UNAME_M),riscv64)
TARGET = riscv64
endif
UNAME_S := $(shell uname -s)
# Darwin takes precedence over the machine type detected above
ifeq ($(UNAME_S),Darwin)
TARGET = darwin
endif
endif
# Per-target compiler selection (target-specific variable values).
# NOTE(review): aarch64 uses the native gcc while aarch64_numa uses the
# cross toolchain — confirm this asymmetry is intentional.
amd64: CC = x86_64-linux-gnu-gcc
amd64_numa: CC = x86_64-linux-gnu-gcc
aarch64: CC := gcc
aarch64_numa: CC = aarch64-linux-gnu-gcc
riscv64: CC = riscv64-linux-gnu-gcc
w64: CC = x86_64-w64-mingw32-gcc
darwin: CC = clang
================================================
FILE: Common/ci_gpumemlatency.sh
================================================
#!/bin/sh
# CI build script for GpuMemLatency: cross-compiles the OpenCL benchmark
# for every supported target, cleaning object files between targets so
# objects from one architecture never leak into the next link.
make_all () {
make amd64
make clean-obj
LDFLAGS="-lm -L ocl-icd-arm64/usr/lib/aarch64-linux-gnu -lOpenCL" make aarch64
make clean-obj
LDFLAGS="-lm -L ocl-icd-riscv64/usr/lib/riscv64-linux-gnu -lOpenCL" make riscv64
make clean-obj
CPPFLAGS="-I OpenCL-SDK-${OCL_VER}-Win-x64/include" LDFLAGS="-lm -L OpenCL-SDK-${OCL_VER}-Win-x64/lib -lOpenCL" make w64
make clean-obj
}
# Fetch the OpenCL ICD loader for the foreign Linux architectures by adding
# ubuntu-ports apt sources, downloading the .deb, and extracting it locally.
linux_deps () {
for ARCH in arm64 riscv64; do
if ! grep -q $ARCH /etc/apt/sources.list; then
echo "deb [arch=${ARCH}] http://ports.ubuntu.com/ubuntu-ports $(lsb_release -c -s) universe" | sudo tee -a /etc/apt/sources.list
echo "deb-src [arch=${ARCH}] http://ports.ubuntu.com/ubuntu-ports $(lsb_release -c -s) universe" | sudo tee -a /etc/apt/sources.list
sudo apt update
fi
apt-get download "ocl-icd-libopencl1:${ARCH}"
find . -type f -name "*${ARCH}*.deb" -exec dpkg-deb -x {} "ocl-icd-${ARCH}" \;
done
# the linker needs the unversioned .so name
cp ocl-icd-arm64/usr/lib/aarch64-linux-gnu/libOpenCL.so.1 ocl-icd-arm64/usr/lib/aarch64-linux-gnu/libOpenCL.so
cp ocl-icd-riscv64/usr/lib/riscv64-linux-gnu/libOpenCL.so.1 ocl-icd-riscv64/usr/lib/riscv64-linux-gnu/libOpenCL.so
}
# Fetch the Khronos OpenCL SDK for the Windows cross build.
w64_deps () {
# fix: was "-fssLO" — the duplicated -s meant -S was never passed, so
# download errors were silenced instead of reported
curl -fsSLO "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/${OCL_VER}/OpenCL-SDK-${OCL_VER}-Win-x64.zip"
unzip "OpenCL-SDK-${OCL_VER}-Win-x64.zip"
}
linux_deps
w64_deps
make_all
================================================
FILE: Common/ci_package.sh
================================================
#!/bin/sh
# Package all prebuilt benchmark binaries into clammarks.txz.
# Layout: clammarks-<short-sha>/<target>/<binaries + OpenCL kernels>
PKG="clammarks-$(git rev-parse --short HEAD)"
rm -rf "$PKG" "clammarks.txz"
mkdir -p "$PKG"
for TARGET in "amd64" "aarch64" "riscv64" "w64"; do
mkdir "$PKG/$TARGET"
# copy every executable whose filename mentions the target architecture
for COMPONENT in CoherencyLatency MemoryLatency MemoryBandwidth InstructionRate Meshsim CoreClockChecker GpuMemLatency; do
find "$COMPONENT" -type f -name "*$TARGET*" -executable -exec cp {} "$PKG/$TARGET" \;
done
# OpenCL kernel sources are loaded at runtime next to the binary
find "GpuMemLatency" -type f -name "*.cl" -exec cp {} "$PKG/$TARGET" \;
done
cp "LICENSE" "$PKG"
tar caf "clammarks.txz" "$PKG"
================================================
FILE: Common/perfmon.h
================================================
// Stuff that only works on Linux. Should be #ifdef-ed out for mingw cross compilation
// Read a 64-bit MSR on the given core via the msr kernel module
// (/dev/cpu/<core>/msr, MSR index = file offset). Returns 0 on any failure.
uint64_t readmsr(uint32_t coreindex, uint32_t msrindex) {
    char buf[256];
    snprintf(buf, sizeof(buf), "/dev/cpu/%d/msr", coreindex);
    uint64_t msrvalue = 0;
    // O_RDONLY is sufficient: this helper only reads (the original asked
    // for O_RDWR, which fails needlessly without write permission)
    int fd = open(buf, O_RDONLY);
    if (fd == -1) {
        fprintf(stderr, "Could not open msr\n");
        return 0;
    }
    // pread replaces the unchecked lseek+read pair; a short read no longer
    // returns stale stack garbage
    if (pread(fd, &msrvalue, sizeof(msrvalue), msrindex) != (ssize_t)sizeof(msrvalue)) {
        fprintf(stderr, "Could not read msr 0x%x on core %u\n", msrindex, coreindex);
        msrvalue = 0;
    }
    close(fd);
    return msrvalue;
}
#define PERF_NUM_EVENTS 4
// Layout of a PERF_FORMAT_GROUP | PERF_FORMAT_ID read from the group
// leader fd: event count followed by (value, id) pairs.
struct perf_read_data {
uint64_t nr;
struct {
uint64_t value;
uint64_t id;
} values[PERF_NUM_EVENTS];
};
// Bookkeeping for one selected perf event.
struct perf_select_data {
uint64_t id; // id used to identify the event when it comes back in a group
int fd; // file descriptor
struct perf_event_attr attr;
uint64_t value; // last counter value captured by stop_perf_monitoring()
const char *description; // CSV column name
};
// Global monitoring state shared by the open/start/stop/close helpers below.
struct perf_select_data perf_selected_events[PERF_NUM_EVENTS];
struct perf_read_data perfReadData;
struct timeval perf_startTv, perf_endTv;
uint64_t perf_time_ms;
// populates basic properties
// Fill a perf_event_attr for a hardware event: user-space counts only,
// created disabled (enabled later via PERF_EVENT_IOC_ENABLE on the group),
// read back as part of a PERF_FORMAT_GROUP.
void initialize_hw_event(struct perf_event_attr *attr, uint64_t cfg, uint32_t hwid) {
memset(attr, 0, sizeof(struct perf_event_attr));
// low 32 bits of config = hardware event id
// high 32 bits = PMU id (atom/core). Get from /sys/devices//type
// on Arrow Lake, atom = 10, core = 4
attr->config = cfg | ((uint64_t)hwid << 32);
attr->type = PERF_TYPE_HARDWARE;
attr->size = sizeof(struct perf_event_attr);
attr->disabled = 1;
attr->exclude_kernel = 1;
attr->exclude_hv = 1;
attr->inherit = 1; // include child threads
attr->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
}
// Open the perf event described by evt->attr for the calling process on any
// CPU, joining the group led by groupfd (-1 = become group leader), and
// record the kernel-assigned id used to match values in group reads.
void set_hw_event(struct perf_select_data *evt, int groupfd) {
    evt->fd = syscall(__NR_perf_event_open, &(evt->attr), 0, -1, groupfd, 0);
    if (evt->fd == -1) {
        // the original went straight to ioctl on -1; report instead
        fprintf(stderr, "perf_event_open failed for %s\n",
            evt->description ? evt->description : "(unnamed event)");
        return;
    }
    if (ioctl(evt->fd, PERF_EVENT_IOC_ID, &(evt->id)) == -1) {
        fprintf(stderr, "PERF_EVENT_IOC_ID failed for fd %d\n", evt->fd);
    }
}
// Create the four monitored events (instructions, cycles, LLC references,
// LLC misses) as a single perf group led by the instructions event, so all
// four are read atomically from the leader fd.
void open_perf_monitoring() {
int groupLeaderFd = -1;
memset(perf_selected_events, 0, sizeof(struct perf_select_data) * PERF_NUM_EVENTS);
perf_selected_events[0].description = "instructions";
initialize_hw_event(&(perf_selected_events[0].attr), PERF_COUNT_HW_INSTRUCTIONS, 0);
set_hw_event(perf_selected_events, -1);
groupLeaderFd = perf_selected_events[0].fd;
perf_selected_events[1].description = "cycles";
initialize_hw_event(&(perf_selected_events[1].attr), PERF_COUNT_HW_CPU_CYCLES, 0);
set_hw_event(perf_selected_events + 1, groupLeaderFd);
// NOTE(review): 0x4F2E / 0x412E are Intel raw encodings for last-level
// cache references / misses — not portable to other vendors' PMUs.
perf_selected_events[2].description = "llc_ref";
initialize_hw_event(&(perf_selected_events[2].attr), 0x4F2E, 0);
perf_selected_events[2].attr.type = PERF_TYPE_RAW;
set_hw_event(perf_selected_events + 2, groupLeaderFd);
perf_selected_events[3].description = "llc_miss";
initialize_hw_event(&(perf_selected_events[3].attr), 0x412E, 0);
perf_selected_events[3].attr.type = PERF_TYPE_RAW;
set_hw_event(perf_selected_events + 3, groupLeaderFd);
}
// Zero all counters in the group and start counting. Also records the
// wall-clock start time used by stop_perf_monitoring() for perf_time_ms.
void start_perf_monitoring() {
gettimeofday(&perf_startTv, NULL);
int groupLeaderFd = perf_selected_events[0].fd;
ioctl(groupLeaderFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
ioctl(groupLeaderFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}
uint64_t instrs, cycles, llcRef, llcMiss;
// Disable the event group, read back all counter values from the leader fd,
// match them to the selected events by id, and record elapsed wall-clock
// time since start_perf_monitoring() in perf_time_ms.
void stop_perf_monitoring() {
    int groupLeaderFd = perf_selected_events[0].fd;
    ioctl(groupLeaderFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
    ssize_t readbytes = read(groupLeaderFd, &perfReadData, sizeof(struct perf_read_data));
    if (readbytes < 0) {
        // failed read would otherwise leave perfReadData uninitialized
        fprintf(stderr, "Could not read perf counters\n");
        perfReadData.nr = 0;
    }
    // match each returned (value, id) pair to the event with the same id;
    // bound by PERF_NUM_EVENTS so a larger-than-expected nr can't walk off
    // the values[] array
    for (uint64_t i = 0; i < perfReadData.nr && i < PERF_NUM_EVENTS; i++) {
        for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) {
            if (perf_selected_events[evt_idx].id == perfReadData.values[i].id) {
                perf_selected_events[evt_idx].value = perfReadData.values[i].value;
            }
        }
    }
    gettimeofday(&perf_endTv, NULL);
    perf_time_ms = ((perf_endTv.tv_sec - perf_startTv.tv_sec) * 1000 + (perf_endTv.tv_usec - perf_startTv.tv_usec) / 1000);
}
// Release every perf event fd opened by open_perf_monitoring().
void close_perf_monitoring() {
    for (int idx = 0; idx < PERF_NUM_EVENTS; idx++) {
        close(perf_selected_events[idx].fd);
    }
}
// Emit one CSV header column per monitored event, plus the elapsed-time column.
void append_perf_header() {
    int idx = 0;
    while (idx < PERF_NUM_EVENTS) {
        printf(",%s", perf_selected_events[idx].description);
        idx++;
    }
    printf(",Time (ms)");
}
// Emit the last-read counter value for each event, plus elapsed milliseconds.
void append_perf_values() {
    int idx = 0;
    while (idx < PERF_NUM_EVENTS) {
        printf(",%lu", perf_selected_events[idx].value);
        idx++;
    }
    printf(",%lu", perf_time_ms);
}
================================================
FILE: Common/timing.c
================================================
#ifdef _MSC_VER
#include
// MSVC path: millisecond timing via ftime().
// start_timing()/end_timing() share one global start/end pair, so they are
// not reentrant; the *_ts variants below take caller-owned state instead.
__declspec(selectany) struct timeb start, end;
// Begin timing using the shared global start timestamp.
void start_timing() {
ftime(&start);
}
// Stop timing; returns milliseconds elapsed since start_timing().
unsigned int end_timing() {
ftime(&end);
return 1000 * (end.time - start.time) + (end.millitm - start.millitm);
}
// Reentrant variant: caller supplies the start-timestamp storage.
void start_timing_ts(struct timeb *startTimeb) {
ftime(startTimeb);
}
// Returns milliseconds elapsed since start_timing_ts() on the same state.
unsigned int end_timing_ts(struct timeb* startTimeb) {
struct timeb end;
ftime(&end);
return 1000 * (end.time - startTimeb->time) + (end.millitm - startTimeb->millitm);
}
#else
#include
#include
// POSIX path: millisecond timing via gettimeofday().
// The global pair makes start_timing()/end_timing() non-reentrant; use the
// *_ts variants when timing regions may nest or run concurrently.
struct timeval startTv, endTv;
// Begin timing using the shared global start timestamp.
void start_timing() {
gettimeofday(&startTv, NULL);
}
// Stop timing; returns milliseconds elapsed since start_timing().
unsigned int end_timing() {
gettimeofday(&endTv, NULL);
return (unsigned int)((endTv.tv_sec - startTv.tv_sec) * 1000 + (endTv.tv_usec - startTv.tv_usec) / 1000);
}
// Reentrant variant: caller supplies the start-timestamp storage.
void start_timing_ts(struct timeval* start) {
gettimeofday(start, NULL);
}
// Returns milliseconds elapsed since start_timing_ts() on the same state.
unsigned int end_timing_ts(struct timeval* start) {
struct timeval end;
gettimeofday(&end, NULL);
return (unsigned int)((end.tv_sec - start->tv_sec) * 1000 + (end.tv_usec - start->tv_usec) / 1000);
}
#endif
// Scale an iteration count so the next run lasts approximately target_time.
// last_time and target_time share an arbitrary unit (milliseconds in this
// codebase). Always returns at least 1 iteration: the original could return
// 0 when target_time < last_time and the ratio truncated to zero, giving
// callers a zero-length (or never-terminating) next run.
unsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time) {
    unsigned long long next;
    // safety measure to deal with nasty timer precision issues if the system is fast
    if (last_time < 50) next = last_iteration_count * 2;
    else next = last_iteration_count * (target_time / last_time);
    return next > 0 ? next : 1;
}
================================================
FILE: Common/timing.h
================================================
#ifndef timingincluded
#define timingincluded
#ifdef _MSC_VER
#include
#else
#include
#endif
// Shared global timer state (used by the MSVC implementation in timing.c).
// NOTE(review): 'struct timeb' stays an incomplete type on non-MSVC builds;
// an extern declaration of an incomplete object is legal but unusable there
// — confirm it is only referenced under _MSC_VER.
extern struct timeb start, end;
// Simple global-state stopwatch: start_timing(), then end_timing() -> ms.
// NOTE(review): declared inline here but defined without inline in
// timing.c — consider dropping 'inline' to avoid C99 linkage surprises.
inline void start_timing();
inline unsigned int end_timing();
// Reentrant variants using caller-provided timestamp storage.
#ifdef _MSC_VER
void start_timing_ts(struct timeb* startTimeb);
unsigned int end_timing_ts(struct timeb* startTimeb);
#else
void start_timing_ts(struct timeval* start);
unsigned int end_timing_ts(struct timeval* start);
#endif
// Scale an iteration count so the next run lasts about target_time (ms).
unsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time);
#endif
================================================
FILE: CoreClockChecker/BoostClockChecker.c
================================================
#include
#include
#include
#include
#include
#include
#include
extern uint64_t clktsctest(uint64_t iterations) __attribute((ms_abi));
// Samples the TSC cost of a fixed dependent-add loop many times back to
// back, then calibrates TSC against wall-clock time to report per-sample
// clock speed — useful for watching boost-clock ramp behavior.
// Flags: -samples N, -iterations N, -sleep N (idle seconds before sampling).
int main(int argc, char *argv[]) {
    struct timeval startTv, endTv;
    uint64_t iterations = 500000, samples = 100;
    unsigned int sleepSeconds = 5;
    time_t time_diff_ms;
    for (int argIdx = 1; argIdx < argc; argIdx++) {
        if (*(argv[argIdx]) == '-') {
            char *arg = argv[argIdx] + 1;
            if (strncmp(arg, "samples", 7) == 0) {
                // bounds check: the original read argv[argc] (NULL) when a
                // flag was given without its value
                if (++argIdx >= argc) break;
                samples = atol(argv[argIdx]);
            } else if (strncmp(arg, "iterations", 10) == 0) {
                if (++argIdx >= argc) break;
                iterations = atol(argv[argIdx]);
            } else if (strncmp(arg, "sleep", 5) == 0) {
                if (++argIdx >= argc) break;
                sleepSeconds = atoi(argv[argIdx]);
            }
        }
    }
    // idle first so the earliest samples catch the boost ramp from idle
    sleep(sleepSeconds);
    uint64_t *measuredTscs = malloc(samples * sizeof(uint64_t));
    if (measuredTscs == NULL) {
        fprintf(stderr, "Could not allocate sample buffer\n");
        return 1;
    }
    for (uint64_t sampleIdx = 0; sampleIdx < samples; sampleIdx++) {
        measuredTscs[sampleIdx] = clktsctest(iterations);
    }
    fprintf(stderr, "Used %lu samples\n", samples);
    fprintf(stderr, "Used %lu iterations\n", iterations);
    // figure out TSC to real time ratio
    fprintf(stderr, "Checking TSC ratio...\n");
    uint64_t iterationsHi = 8e9; // should be a couple seconds at least?
    gettimeofday(&startTv, NULL);
    uint64_t referenceElapsedTsc = clktsctest(iterationsHi);
    gettimeofday(&endTv, NULL);
    time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
    float tsc_per_ms = (float)referenceElapsedTsc / (float)time_diff_ms;
    float tsc_per_ns = tsc_per_ms / 1e6;
    fprintf(stderr, "TSC = %lu, elapsed ms = %lu\n", referenceElapsedTsc, time_diff_ms);
    fprintf(stderr, "TSC per ms: %f, TSC per ns: %f\n", tsc_per_ms, tsc_per_ns);
    printf("Time (ms), Clk (GHz), TSC\n");
    float elapsedTime = 0;
    for (uint64_t sampleIdx = 0; sampleIdx < samples; sampleIdx++) {
        // elapsed TSC / (TSC per ms) = elapsed ms for this sample
        float elapsedTimeMs = measuredTscs[sampleIdx] / tsc_per_ms;
        elapsedTime += elapsedTimeMs;
        // the test loop is a dependent-add chain, presumably retiring one
        // add per cycle, so adds/ns approximates core clock in GHz
        float latency = 1e6 * elapsedTimeMs / (float)iterations;
        float addsPerNs = 1 / latency;
        printf("%f,%f,%lu\n", elapsedTime, addsPerNs, measuredTscs[sampleIdx]);
    }
    free(measuredTscs); // was leaked in the original
    return 0;
}
================================================
FILE: CoreClockChecker/BoostClockChecker_arm.s
================================================
.text
.global clktsctest
.global _clktsctest
.balign 4
/* x0 = iterations, return elapsed TSC in x0 */
/* Executes a serially-dependent add chain (20 adds per loop pass, so x0
   must be a multiple of 20) bracketed by cntvct_el0 reads; returns the
   elapsed virtual counter ticks.
   NOTE(review): x10-x15 are caller-saved under AAPCS64, so the stack
   save/restore below is redundant but harmless. */
_clktsctest:
clktsctest:
sub sp, sp, #0x40
stp x10, x11, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x14, x15, [sp, #0x30]
mov x10, 1
mov x11, 20
mov x12, 0
/* stackoverflow says this is a good idea */
mrs x14, cntvct_el0
clktsctest_loop:
/* each add depends on the previous one, giving a fixed 1-add-per-cycle
   dependency chain regardless of core width */
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
add x12, x12, x10
sub x0, x0, x11
cbnz x0, clktsctest_loop
mrs x15, cntvct_el0
/* return end counter minus start counter */
sub x0, x15, x14
ldp x14, x15, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x10, x11, [sp, #0x10]
add sp, sp, #0x40
ret
================================================
FILE: CoreClockChecker/BoostClockChecker_x86.s
================================================
.global clktsctest
/* Microsoft x64 calling convention (the C prototype is declared ms_abi):
   rcx = iterations (must be a multiple of 20), returns elapsed TSC in rax.
   Runs a serially-dependent add chain bracketed by rdtsc reads.
   Fix: rdi is CALLEE-SAVED under ms_abi, but the original used it as the
   loop counter without saving it, corrupting the caller's rdi. */
clktsctest:
push %rdx
push %rbx
push %rdi /* ms_abi nonvolatile; used below as the loop counter */
push %r8
push %r9
push %r10
mov %rcx, %rdi
mov $1, %r8 /* increment for the dependency chain */
mov $20, %r9 /* adds per loop pass */
xor %rbx, %rbx
rdtsc /* high 32 bits in EDX, low 32 bits in EAX */
shl $32, %rdx /* shift high 32 bits into upper half of RDX */
add %rax, %rdx /* place full 64-bit value in rdx */
mov %rdx, %r10 /* stash start TSC */
clktsctest_loop:
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
sub %r9, %rdi
jnz clktsctest_loop
rdtsc
shl $32, %rdx
add %rdx, %rax /* now rax has the new value */
sub %r10, %rax /* subtract old TSC value from the new one, which should be larger */
pop %r10
pop %r9
pop %r8
pop %rdi
pop %rbx
pop %rdx
ret
================================================
FILE: CoreClockChecker/CoreClockChecker.c
================================================
#define _GNU_SOURCE
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#define MSR_RAPL_PWR_UNIT 0xC0010299
#define HWCR 0xC0010015
#define MSR_CORE_ENERGY_STAT 0xC001029A
#define MSR_PKG_ENERGY_STAT 0xC001029B
#define INTEL_MSR_RAPL_PWR_UNIT 0x606
#define INTEL_MSR_PP0_ENERGY_STATUS 0x639
#define INTEL_MSR_PKG_ENERGY_STATUS 0x611
extern uint64_t clktest(uint64_t iterations) __attribute((sysv_abi));
void detectCpuMaker();
void setBoost(int on);
void setAffinity(int core);
int openMsr(int core);
uint64_t readMsr(int fd, uint32_t addr);
void writeMsr(int fd, uint32_t addr, uint64_t value);
float getEnergyStatusUnits();
uint64_t getCoreEnergyStat(int core);
uint64_t getPkgEnergyStat(int core);
uint64_t getTotalCoreEnergy();
int *msrFds;
int amdCpu = 1;
int numProcs = 0;
// Modes (argv[1]): disableboost / enableboost (toggle AMD CPB via HWCR),
// "power" (per-core + package power while spinning the add loop),
// "measurecmd <cmd>" (RAPL energy around an arbitrary command), or no args
// (per-core clock speed via the dependent-add loop).
int main(int argc, char *argv[]) {
    struct timeval startTv, endTv;
    time_t time_diff_ms;
    float latency, clockSpeedGhz, energyUnits;
    uint64_t startEnergy, endEnergy, startPkgEnergy, endPkgEnergy;
    uint64_t iterationsHigh = 8e9;
    detectCpuMaker();
    numProcs = get_nprocs();
    fprintf(stderr, "Number of CPUs: %u\n", numProcs);
    // MSR fds are opened lazily by the helpers; zero = not yet opened
    msrFds = (int *)malloc(sizeof(int) * numProcs);
    memset(msrFds, 0, sizeof(int) * numProcs);
    if (argc > 1 && strncmp(argv[1], "disableboost", 12) == 0) {
        setBoost(0);
    } else if (argc > 1 && strncmp(argv[1], "enableboost", 11) == 0) {
        setBoost(1);
    } else if (argc > 1 && strncmp(argv[1], "power", 5) == 0) {
        iterationsHigh *= 2; // try for more accuracy
        energyUnits = getEnergyStatusUnits();
        printf("Core, Core Power, Package Power\n");
        for (int i = 0; i < numProcs; i++) {
            setAffinity(i);
            gettimeofday(&startTv, NULL);
            startEnergy = getCoreEnergyStat(i);
            startPkgEnergy = getPkgEnergyStat(i);
            clktest(iterationsHigh);
            endPkgEnergy = getPkgEnergyStat(i);
            endEnergy = getCoreEnergyStat(i);
            gettimeofday(&endTv, NULL);
            time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
            // fix: the original divided by (time_diff_ms / 1000) in integer
            // arithmetic, truncating partial seconds and overstating power
            float seconds = (float)time_diff_ms / 1000.0f;
            printf("%d, %f, %f\n", i,
                ((endEnergy - startEnergy) * energyUnits) / seconds,
                ((endPkgEnergy - startPkgEnergy) * energyUnits) / seconds);
        }
    } else if (argc > 2 && strncmp(argv[1], "measurecmd", 10) == 0) {
        // fix: length was 9, which also matched e.g. "measurecmX"
        int rc;
        float coreJoules, pkgJoules;
        fprintf(stderr, "argv[2] is %s\nOnly handling Intel at the moment\n", argv[2]);
        energyUnits = getEnergyStatusUnits();
        gettimeofday(&startTv, NULL);
        startEnergy = getTotalCoreEnergy();
        startPkgEnergy = getPkgEnergyStat(0);
        rc = system(argv[2]);
        endEnergy = getTotalCoreEnergy();
        endPkgEnergy = getPkgEnergyStat(0);
        gettimeofday(&endTv, NULL);
        fprintf(stderr, "system() returned %d\n", rc);
        time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
        coreJoules = (endEnergy - startEnergy) * energyUnits;
        pkgJoules = (endPkgEnergy - startPkgEnergy) * energyUnits;
        printf("Core Joules: %f\n", coreJoules);
        printf("Package Joules: %f\n", pkgJoules);
        printf("Elapsed time, seconds: %f\n", (double)time_diff_ms / 1000);
    }
    else {
        for (int i = 0; i < numProcs; i++) {
            setAffinity(i);
            gettimeofday(&startTv, NULL);
            clktest(iterationsHigh);
            gettimeofday(&endTv, NULL);
            time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
            // ns per dependent add; its reciprocal approximates GHz
            latency = 1e6 * (float)time_diff_ms / (float)iterationsHigh;
            clockSpeedGhz = 1 / latency;
            printf("%d, %f GHz\n", i, clockSpeedGhz);
        }
    }
    // close any MSR fds the helpers opened on demand (were leaked before)
    for (int i = 0; i < numProcs; i++) {
        if (msrFds[i] > 0) close(msrFds[i]);
    }
    free(msrFds);
    return 0;
}
// Identify the CPU vendor from CPUID leaf 0 and set the global amdCpu flag
// (1 for AuthenticAMD, 0 otherwise, including GenuineIntel).
void detectCpuMaker() {
    uint32_t eax, ebx, ecx, edx;
    char vendor[13];
    amdCpu = 0;
    __cpuid_count(0, 0, eax, ebx, ecx, edx);
    // the 12-byte vendor string is returned in ebx:edx:ecx order
    memcpy(vendor, &ebx, 4);
    memcpy(vendor + 4, &edx, 4);
    memcpy(vendor + 8, &ecx, 4);
    vendor[12] = 0;
    fprintf(stderr, "CPU name: %s\n", vendor);
    if (memcmp(vendor, "GenuineIntel", 12) == 0) {
        amdCpu = 0;
        fprintf(stderr, "Looks like Intel\n");
    } else if (memcmp(vendor, "AuthenticAMD", 12) == 0) {
        amdCpu = 1;
        fprintf(stderr, "Looks like AMD\n");
    }
}
// Pin the calling thread to the given logical core; logs on failure.
void setAffinity(int core) {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core, &cpuset);
    if (pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset) != 0) {
        fprintf(stderr, "unable to set thread affinity to %d\n", core);
    }
}
// Open the msr device file (/dev/cpu/<core>/msr) for read/write access.
// Returns the fd, or -1 on failure (requires root + the msr kernel module).
int openMsr(int core) {
    char msrFilename[255];
    int fd;
    // snprintf replaces the original unbounded sprintf
    snprintf(msrFilename, sizeof(msrFilename), "/dev/cpu/%d/msr", core);
    fd = open(msrFilename, O_RDWR);
    if (fd < 0) {
        fprintf(stderr, "Could not open MSR file, core %d\n", core);
        return -1;
    }
    return fd;
}
// Read a 64-bit MSR at the given address (MSR index = file offset).
// Returns 0 on failure; the original returned an uninitialized local when
// pread failed or came up short.
uint64_t readMsr(int fd, uint32_t addr) {
    uint64_t result = 0;
    ssize_t bytesRead = pread(fd, &result, sizeof(result), addr);
    if (bytesRead != (ssize_t)sizeof(result)) {
        fprintf(stderr, "Could not read from fd %d, msr %u\n", fd, addr);
        return 0;
    }
    return result;
}
// Write a 64-bit MSR, then read it back and warn if the value did not take
// effect (some MSR bits are read-only or locked down by firmware).
void writeMsr(int fd, uint32_t addr, uint64_t value) {
    uint64_t newValue;
    // pwrite returns ssize_t; the original stored it in uint64_t, which
    // still compared correctly but obscured the -1 error case
    ssize_t bytesWritten = pwrite(fd, &value, sizeof(value), addr);
    if (bytesWritten != (ssize_t)sizeof(value)) {
        fprintf(stderr, "Could not write to fd %d, msr %u, value %lu\n", fd, addr, value);
    }
    newValue = readMsr(fd, addr);
    if (value != newValue) {
        fprintf(stderr, "Wrote to fd %d, msr %u, value %lu, but write did not take effect\n", fd, addr, value);
    }
}
// Toggle AMD Core Performance Boost (CPB) on every logical CPU by flipping
// bit 25 of the HWCR MSR (bit set = boost disabled). AMD-specific: HWCR is
// an AMD MSR; this is not guarded by the amdCpu flag, so only call it on AMD.
void setBoost(int on) {
uint64_t hwcrValue;
for (int i = 0; i < numProcs; i++) {
setAffinity(i);
// open the per-core MSR fd lazily and cache it in msrFds
if (!msrFds[i]) msrFds[i] = openMsr(i);
hwcrValue = readMsr(msrFds[i], HWCR);
if (on) {
hwcrValue &= ~(1UL << 25); // unset bit to request CPB on
//fprintf(stderr, "Requesting CPB on (unsetting bit 25 in HWCR): 0x%08x\n", hwcrValue);
} else {
hwcrValue |= (1UL << 25); // set bit to disable CPB
//fprintf(stderr, "Requesting CPB off (setting bit 25 in HWCR): 0x%08x\n", hwcrValue);
}
// writeMsr reads the value back and warns if the change did not stick
writeMsr(msrFds[i], HWCR, hwcrValue);
}
}
// Read the RAPL power-unit MSR (vendor-specific address) and convert the
// energy status unit field into joules per counter tick: 0.5^ESU.
float getEnergyStatusUnits() {
    setAffinity(0);
    if (!msrFds[0]) msrFds[0] = openMsr(0);
    uint32_t unitMsr = amdCpu ? MSR_RAPL_PWR_UNIT : INTEL_MSR_RAPL_PWR_UNIT;
    uint64_t raplPwrUnit = readMsr(msrFds[0], unitMsr);
    // bits [12:8] hold the energy status unit exponent
    uint64_t energyUnits = (raplPwrUnit >> 8) & 0x1F;
    return (float)pow(0.5, (double)energyUnits);
}
// Read the accumulated core energy counter: per-core on AMD,
// PP0 (all-cores power plane) on Intel.
uint64_t getCoreEnergyStat(int core) {
    if (!msrFds[core]) msrFds[core] = openMsr(core);
    uint32_t addr = amdCpu ? MSR_CORE_ENERGY_STAT : INTEL_MSR_PP0_ENERGY_STATUS;
    return readMsr(msrFds[core], addr);
}
// Read the accumulated package-level energy counter via the given core's
// MSR fd (vendor-specific MSR address).
uint64_t getPkgEnergyStat(int core) {
    if (!msrFds[core]) msrFds[core] = openMsr(core);
    uint32_t addr = amdCpu ? MSR_PKG_ENERGY_STAT : INTEL_MSR_PKG_ENERGY_STATUS;
    return readMsr(msrFds[core], addr);
}
// Sum the per-core energy counters across all physical cores (AMD), or
// return the single PP0 counter (Intel, which does not track per-core).
uint64_t getTotalCoreEnergy() {
if (amdCpu) {
uint64_t totalCoreEnergy = 0;
// only testing the 5950X and 3950X for now, and physical cores
// are 0-15 on linux. hack around this until I have time to
// programatically figure out SMT siblings
// NOTE(review): hard-coded 16 under-/over-counts on any other AMD part.
for (int i = 0; i < 16; i++) {
totalCoreEnergy += getCoreEnergyStat(i);
}
return totalCoreEnergy;
} else {
// intel does not track power per core
return getCoreEnergyStat(0);
}
}
================================================
FILE: CoreClockChecker/CoreClockChecker_x86.s
================================================
.global clktest
/*
%rdi = arg0 = iteration count
*/
/* SysV AMD64: runs rdi iterations of a serially-dependent add chain
   (20 adds per loop pass, so rdi must be a multiple of 20). The dependent
   chain pins throughput to one add per cycle, so iterations / wall time
   approximates core clock. No return value is produced; the caller times it.
   NOTE(review): r8/r9 are caller-saved under SysV — the push/pop pairs are
   redundant but harmless. rbx is callee-saved and correctly preserved. */
clktest:
push %rbx
push %r8
push %r9
mov $1, %r8
mov $20, %r9
xor %rbx, %rbx
clktest_loop:
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
add %r8, %rbx
sub %r9, %rdi
jnz clktest_loop
pop %r9
pop %r8
pop %rbx
ret
================================================
FILE: CoreClockChecker/Makefile
================================================
# Build CoreClockChecker / BoostClockChecker for the detected (or given)
# target; arch_detect.mk supplies TARGET and the per-target CC.
include ../Common/arch_detect.mk
CFLAGS = -O3
LDFLAGS = -lm
all: $(TARGET)
# CoreClockChecker needs MSR/affinity support and is x86-Linux only
amd64:
	$(CC) $(CFLAGS) -pthread CoreClockChecker.c CoreClockChecker_x86.s -o CoreClockChecker_amd64 $(LDFLAGS)
	$(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_x86.s -o BoostClockChecker_amd64 $(LDFLAGS)
aarch64:
	$(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_arm.s -o BoostClockChecker_aarch64 $(LDFLAGS)
w64:
	$(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_x86.s -o BoostClockChecker_w64.exe $(LDFLAGS)
ci: amd64 aarch64 w64
clean:
	rm -f *.o && find . -type f -executable -delete
.PHONY: all ci clean
================================================
FILE: CoreClockChecker/WinCoreClockChecker/CoreClockCheckFunctions.asm
================================================
section .text
bits 64
global clktest
; rcx = iteration count
; rdx = address of memory location to monitor
; return elapsed tsc
; NOTE(review): despite the header comment above, this returns the add
; counter (rbx), not an elapsed TSC; the caller divides it by wall time to
; get GHz. Runs a dependent add chain (20 adds per pass) and also polls the
; shared flag: the first thread to exhaust its budget sets *rdx, making all
; other threads stop on their next pass so their run times stay aligned.
clktest:
push rdx
push rbx
push r8
push r9
push r10
push r11
xor rbx, rbx
mov r8, 1 ; GLC will eliminate adds with immediates or increments
clktest_loop:
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
add rbx, r8
mov r11d, [rdx]
test r11d, r11d
jnz clktest_loop_end ; early exit condition (someone else exited)
sub rcx, 20
jg clktest_loop
; budget exhausted first: raise the shared stop flag for the other threads
mov [rdx], r8
clktest_loop_end:
mov rax, rbx ; return completed add count
pop r11
pop r10
pop r9
pop r8
pop rbx
pop rdx
ret
================================================
FILE: CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.cpp
================================================
// WinCoreClockChecker.cpp : This file contains the 'main' function. Program execution begins and ends there.
//
#include
#include
#include
#include
#include
extern "C" uint64_t clktest(uint64_t iterations, uint64_t *flag);
// Hard-coded logical-core orderings for one specific hybrid CPU:
// E-cores at logical 2..9, P-cores on even logical indices (12,10,14,16,18,0),
// and 20/21 used as "LPE" cores (see the test labels in main).
// NOTE(review): machine-specific; re-derive these tables for other topologies.
int ECoreTestOrder[] = { 2, 3, 4, 5, 6, 7, 8, 9 };
int BackwardECoreTestOrder[] = { 9, 8, 7, 6, 5, 4, 3, 2 };
int AlternatingECoreTestOrder[] = { 2, 6, 3, 7, 4, 8, 5, 9 };
int PCoreTestOrder[] = { 12, 10, 14, 16, 18, 0 };
int AllECores[] = { 20, 21, 2, 3, 4, 5, 6, 7, 8, 9 };
int AllCores[] = { 12, 10, 14, 16, 18, 0, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21 };
// Per-thread parameters and results for ClockTestThread.
struct ClockTestData {
uint64_t iterations; // iteration budget handed to clktest
uint64_t completed_iterations; // adds actually retired before the stop flag
uint64_t *flag; // shared early-exit flag (first finisher sets it)
};
float* runMtClockTest(int* cores, int nCores);
void PrintResults(int* cores, float* results, int coreCount);
void RunCoreByCoreClockTest(int* cores, int coreCount);
void RunEvenCoreTest(int coreCount);
// per-thread iteration budget; raised for P-core runs in main
uint64_t start_iterations = 8e9;
// Runs a sequence of multi-threaded clock-scaling tests over fixed core
// orderings; an optional argv[1] additionally tests that many even-indexed
// logical cores first.
int main(int argc, char *argv[])
{
    // Test E-Cores one by one
    start_iterations = 8e9;
    if (argc > 1)
    {
        int evenCoreCount = atoi(argv[1]);
        // fix: the original printf had "%d" but no argument (undefined behavior)
        printf("Even Cores, core count %d\n", evenCoreCount);
        RunEvenCoreTest(evenCoreCount);
    }
    // first E-core pass doubles as warmup; results come from the repeat runs
    printf("E-Cores, Warmup:\n");
    RunCoreByCoreClockTest(ECoreTestOrder, sizeof(ECoreTestOrder) / sizeof(int));
    printf("E-Cores, filling one cluster first:\n");
    RunCoreByCoreClockTest(ECoreTestOrder, sizeof(ECoreTestOrder) / sizeof(int));
    printf("E-Cores, filling other cluster first but still one cluster at a time:\n");
    RunCoreByCoreClockTest(BackwardECoreTestOrder, sizeof(BackwardECoreTestOrder) / sizeof(int));
    printf("E-Cores, alternating cores between clusters:\n");
    RunCoreByCoreClockTest(AlternatingECoreTestOrder, sizeof(AlternatingECoreTestOrder) / sizeof(int));
    printf("E-Cores, LPE first:\n");
    RunCoreByCoreClockTest(AllECores, sizeof(AllECores) / sizeof(int));
    // P-cores run faster, so give them a larger iteration budget
    start_iterations = 12e9;
    printf("P-Cores, warmup:\n");
    RunCoreByCoreClockTest(PCoreTestOrder, sizeof(PCoreTestOrder) / sizeof(int));
    printf("P-Cores, fastest core first:\n");
    RunCoreByCoreClockTest(PCoreTestOrder, sizeof(PCoreTestOrder) / sizeof(int));
    printf("All cores, fastest core first:\n");
    RunCoreByCoreClockTest(AllCores, sizeof(AllCores) / sizeof(int));
    return 0;
}
// Run the scaling clock test across the first coreCount even-numbered
// logical cores (0, 2, 4, ...).
void RunEvenCoreTest(int coreCount)
{
    int* coreSequence = (int*)malloc(sizeof(int) * coreCount);
    for (int idx = 0; idx < coreCount; idx++)
        coreSequence[idx] = 2 * idx;
    RunCoreByCoreClockTest(coreSequence, coreCount);
    free(coreSequence);
}
// For n = 1..coreCount, run the multi-threaded clock test on the first n
// cores of the given order, collecting per-core GHz results into a
// (triangular) coreCount x coreCount matrix, then print it.
void RunCoreByCoreClockTest(int *cores, int coreCount)
{
    // calloc = malloc + zero fill; untested cells stay 0 and print as "-"
    float* matrix = (float*)calloc((size_t)coreCount * coreCount, sizeof(float));
    for (int n = 1; n <= coreCount; n++)
    {
        float* row = runMtClockTest(cores, n);
        for (int j = 0; j < n; j++)
            matrix[coreCount * (n - 1) + j] = row[j];
        free(row);
    }
    PrintResults(cores, matrix, coreCount);
    free(matrix);
}
// Print the result matrix as CSV: a header row of core IDs, then one row
// per active-core count, with "-" for cells that were not tested.
void PrintResults(int *cores, float* results, int coreCount)
{
    for (int c = 0; c < coreCount; c++)
        printf(",%d", cores[c]);
    printf("\n");
    for (int row = 0; row < coreCount; row++)
    {
        printf("%d", row + 1); // active core count for this row
        for (int col = 0; col < coreCount; col++)
        {
            float value = results[coreCount * row + col];
            if (value != 0.0f) printf(",%f", value);
            else printf(",-");
        }
        printf("\n");
    }
}
// Thread entry: runs the assembly add-chain loop and records how much work
// it completed. clktest returns its add counter, which grows one per add
// until the iteration budget runs out or the shared stop flag is raised.
DWORD WINAPI ClockTestThread(LPVOID param)
{
struct ClockTestData* testData = (struct ClockTestData*)param;
testData->completed_iterations = clktest(testData->iterations, testData->flag);
return 0;
}
// cores = array of test order -> logical core id
// cores = array of test order -> logical core id
// Spawn one pinned thread per core (created suspended so all threads start
// together), time the whole run, and convert each thread's completed work
// into GHz — the dependent-add loop presumably retires ~1 add per cycle,
// so adds/second ~ clock. Caller frees the returned array.
float* runMtClockTest(int* cores, int nCores)
{
struct timeb start, end;
struct ClockTestData* threadData = (struct ClockTestData*)malloc(sizeof(struct ClockTestData) * nCores);
float* results = (float*)malloc(sizeof(float) * nCores);
memset(results, 0, sizeof(float) * nCores);
HANDLE* testThreads = (HANDLE*)malloc(sizeof(HANDLE) * nCores);
// try to align test times
float maxThreadTsc, minThreadTsc; // NOTE(review): unused locals
float time_diff_sec;
// shared early-exit flag: the first thread to finish sets it, stopping the
// others so all threads run for roughly the same wall time
uint64_t flag = 0;
for (int i = 0; i < nCores; i++)
{
threadData[i].iterations = start_iterations;
threadData[i].flag = &flag;
// NOTE(review): CreateThread's return is not checked (a NULL handle would
// fault in ResumeThread), thread handles are never CloseHandle'd, and
// WaitForMultipleObjects caps at 64 handles — confirm acceptable here.
testThreads[i] = CreateThread(NULL, 0, ClockTestThread, threadData + i, CREATE_SUSPENDED, NULL);
SetThreadAffinityMask(testThreads[i], 1ULL << (uint64_t)cores[i]);
}
ftime(&start);
for (int i = 0; i < nCores; i++)
{
ResumeThread(testThreads[i]);
}
WaitForMultipleObjects(nCores, testThreads, TRUE, INFINITE);
ftime(&end);
time_diff_sec = (float)(end.time - start.time) + 0.001f * (end.millitm - start.millitm);
for (int i = 0; i < nCores; i++)
{
// fprintf(stderr, "Core %d: %llu iterations in %f sec\n", cores[i], threadData[i].completed_iterations, time_diff_sec);
float ghz = ((float)threadData[i].completed_iterations / 1e9) / time_diff_sec;
// fprintf(stderr, "Core %d: %f GHz\n", cores[i], ghz);
results[i] = ghz;
}
free(testThreads);
free(threadData);
return results;
}
================================================
FILE: CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.sln
================================================
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.9.34723.18
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WinCoreClockChecker", "WinCoreClockChecker.vcxproj", "{D70EC1DD-794C-4156-8483-227E566CC76B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|x64 = Release|x64
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x64.ActiveCfg = Debug|x64
{D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x64.Build.0 = Debug|x64
{D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x86.ActiveCfg = Debug|Win32
{D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x86.Build.0 = Debug|Win32
{D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x64.ActiveCfg = Release|x64
{D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x64.Build.0 = Release|x64
{D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x86.ActiveCfg = Release|Win32
{D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x86.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {6AA7051E-EAEF-48CA-9C08-8641D57B3EB1}
EndGlobalSection
EndGlobal
================================================
FILE: CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.vcxproj
================================================
Debug
Win32
Release
Win32
Debug
x64
Release
x64
17.0
Win32Proj
{d70ec1dd-794c-4156-8483-227e566cc76b}
WinCoreClockChecker
10.0
Application
true
v143
Unicode
Application
false
v143
true
Unicode
Application
true
v143
Unicode
Application
false
v143
true
Unicode
Level3
true
WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
Level3
true
_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
Document
nasm -f win64 CoreClockCheckFunctions.asm
CoreClockCheckFunctions.obj
nasm -f win64 CoreClockCheckFunctions.asm
CoreClockCheckFunctions.obj
nasm -f win64 CoreClockCheckFunctions.asm
CoreClockCheckFunctions.obj
nasm -f win64 CoreClockCheckFunctions.asm
CoreClockCheckFunctions.obj
================================================
FILE: CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.vcxproj.filters
================================================
{4FC737F1-C7A5-4376-A066-2A32D752A2FF}
cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx
{93995380-89BD-4b04-88EB-625FBE52EBFB}
h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd
{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
Source Files
Source Files
================================================
FILE: GpuMemLatency/Makefile
================================================
# Build GpuMemLatency: OpenCL memory latency / bandwidth / atomics benchmarks.
# arch_detect.mk sets TARGET to the detected platform name, which "all" uses
# to dispatch to the matching arch-specific rule below.
include ../Common/arch_detect.mk
# OpenCL SDK/ICD version fetched by the CI script
OCL_VER = v2023.04.17
CI_SCRIPT = ../Common/ci_gpumemlatency.sh
CFLAGS = -O3 -I ../Common
DEPS = ../Common/timings.h
OBJ = opencltest.o latency_test.o bw_test.o common.o atomic_test.o instruction_rate.o timing.o
# ?= keeps a user-supplied LDFLAGS; macOS links OpenCL as a framework
LDFLAGS ?= -lm -lOpenCL
ifeq ($(TARGET), Darwin)
LDFLAGS = -lm -framework OpenCL
endif
all: $(TARGET)
# plain binary name (no arch suffix)
GpuMemLatency: $(OBJ)
$(CC) $(CPPFLAGS) $(CFLAGS) $^ -o $@ $(LDFLAGS)
# generic compile rule; rebuild when shared timing header changes
%.o: %.c $(DEPS)
$(CC) $(CFLAGS) -c -o $@ $<
# timing.c lives in ../Common, outside the pattern rule's reach
timing.o:
$(CC) $(CFLAGS) -c ../Common/timing.c -o timing.o
# per-arch targets differ only in the output binary's name suffix
amd64: $(OBJ)
$(CC) $(CFLAGS) $^ -o GpuMemLatency_amd64 $(LDFLAGS)
aarch64: $(OBJ)
$(CC) $(CFLAGS) $^ -o GpuMemLatency_aarch64 $(LDFLAGS)
riscv64: $(OBJ)
$(CC) $(CFLAGS) $^ -o GpuMemLatency_riscv64 $(LDFLAGS)
w64: $(OBJ)
$(CC) $(CFLAGS) $^ -o GpuMemLatency_w64.exe $(LDFLAGS)
darwin: $(OBJ)
$(CC) $(CFLAGS) $^ -o GpuMemLatency_darwin $(LDFLAGS)
# CI entry point: clean build driven by the shared shell script
ci: clean
@OCL_VER=$(OCL_VER) sh $(CI_SCRIPT)
clean-ci:
rm -rf "*.deb" "*.zip" "ocl-icd-*" "OpenCL-SDK-*"
clean-obj:
rm -f *.o
clean: clean-ci clean-obj
find . -type f -executable -delete
.PHONY: all ci clean-ci clean-obj clean
================================================
FILE: GpuMemLatency/OpenCL/LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: GpuMemLatency/OpenCL/README.md
================================================
# OpenCL&trade; API Headers
This repository contains C language headers for the OpenCL API.
The authoritative public repository for these headers is located at:
https://github.com/KhronosGroup/OpenCL-Headers
Issues, proposed fixes for issues, and other suggested changes should be
created using Github.
## Branch Structure
The OpenCL API headers in this repository are Unified headers and are designed
to work with all released OpenCL versions. This differs from previous OpenCL
API headers, where version-specific API headers either existed in separate
branches, or in separate folders in a branch.
## Compiling for a Specific OpenCL Version
By default, the OpenCL API headers in this repository are for the latest
OpenCL version (currently OpenCL 2.2). To use these API headers to target
a different OpenCL version, an application may `#define` the preprocessor
value `CL_TARGET_OPENCL_VERSION` before including the OpenCL API headers.
The `CL_TARGET_OPENCL_VERSION` is a three digit decimal value representing
the OpenCL API version.
For example, to enforce usage of no more than the OpenCL 1.2 APIs, you may
include the OpenCL API headers as follows:
```
#define CL_TARGET_OPENCL_VERSION 120
#include <CL/opencl.h>
```
## Directory Structure
```
README.md This file
LICENSE Source license for the OpenCL API headers
CL/ Unified OpenCL API headers tree
```
## License
See [LICENSE](LICENSE).
---
OpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by Khronos.
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_H
#define __OPENCL_CL_H
#include <CL/cl_version.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************/
typedef struct _cl_platform_id * cl_platform_id;
typedef struct _cl_device_id * cl_device_id;
typedef struct _cl_context * cl_context;
typedef struct _cl_command_queue * cl_command_queue;
typedef struct _cl_mem * cl_mem;
typedef struct _cl_program * cl_program;
typedef struct _cl_kernel * cl_kernel;
typedef struct _cl_event * cl_event;
typedef struct _cl_sampler * cl_sampler;
typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
typedef cl_ulong cl_bitfield;
typedef cl_bitfield cl_device_type;
typedef cl_uint cl_platform_info;
typedef cl_uint cl_device_info;
typedef cl_bitfield cl_device_fp_config;
typedef cl_uint cl_device_mem_cache_type;
typedef cl_uint cl_device_local_mem_type;
typedef cl_bitfield cl_device_exec_capabilities;
#ifdef CL_VERSION_2_0
typedef cl_bitfield cl_device_svm_capabilities;
#endif
typedef cl_bitfield cl_command_queue_properties;
#ifdef CL_VERSION_1_2
typedef intptr_t cl_device_partition_property;
typedef cl_bitfield cl_device_affinity_domain;
#endif
typedef intptr_t cl_context_properties;
typedef cl_uint cl_context_info;
#ifdef CL_VERSION_2_0
typedef cl_bitfield cl_queue_properties;
#endif
typedef cl_uint cl_command_queue_info;
typedef cl_uint cl_channel_order;
typedef cl_uint cl_channel_type;
typedef cl_bitfield cl_mem_flags;
#ifdef CL_VERSION_2_0
typedef cl_bitfield cl_svm_mem_flags;
#endif
typedef cl_uint cl_mem_object_type;
typedef cl_uint cl_mem_info;
#ifdef CL_VERSION_1_2
typedef cl_bitfield cl_mem_migration_flags;
#endif
typedef cl_uint cl_image_info;
#ifdef CL_VERSION_1_1
typedef cl_uint cl_buffer_create_type;
#endif
typedef cl_uint cl_addressing_mode;
typedef cl_uint cl_filter_mode;
typedef cl_uint cl_sampler_info;
typedef cl_bitfield cl_map_flags;
#ifdef CL_VERSION_2_0
typedef intptr_t cl_pipe_properties;
typedef cl_uint cl_pipe_info;
#endif
typedef cl_uint cl_program_info;
typedef cl_uint cl_program_build_info;
#ifdef CL_VERSION_1_2
typedef cl_uint cl_program_binary_type;
#endif
typedef cl_int cl_build_status;
typedef cl_uint cl_kernel_info;
#ifdef CL_VERSION_1_2
typedef cl_uint cl_kernel_arg_info;
typedef cl_uint cl_kernel_arg_address_qualifier;
typedef cl_uint cl_kernel_arg_access_qualifier;
typedef cl_bitfield cl_kernel_arg_type_qualifier;
#endif
typedef cl_uint cl_kernel_work_group_info;
#ifdef CL_VERSION_2_1
typedef cl_uint cl_kernel_sub_group_info;
#endif
typedef cl_uint cl_event_info;
typedef cl_uint cl_command_type;
typedef cl_uint cl_profiling_info;
#ifdef CL_VERSION_2_0
typedef cl_bitfield cl_sampler_properties;
typedef cl_uint cl_kernel_exec_info;
#endif
#ifdef CL_VERSION_3_0
typedef cl_bitfield cl_device_atomic_capabilities;
typedef cl_uint cl_khronos_vendor_id;
typedef cl_bitfield cl_mem_properties;
typedef cl_uint cl_version;
#endif
/* Image format descriptor: channel ordering plus per-channel data type. */
typedef struct _cl_image_format {
cl_channel_order image_channel_order;
cl_channel_type image_channel_data_type;
} cl_image_format;
#ifdef CL_VERSION_1_2
/* Image descriptor (OpenCL 1.2+): image type plus dimensions and pitches.
* In OpenCL 2.0+ the buffer field becomes an anonymous union with
* mem_object, hence the compiler-specific warning suppression below. */
typedef struct _cl_image_desc {
cl_mem_object_type image_type;
size_t image_width;
size_t image_height;
size_t image_depth;
size_t image_array_size;
size_t image_row_pitch;
size_t image_slice_pitch;
cl_uint num_mip_levels;
cl_uint num_samples;
#ifdef CL_VERSION_2_0
#ifdef __GNUC__
__extension__ /* Prevents warnings about anonymous union in -pedantic builds */
#endif
#ifdef _MSC_VER
#pragma warning( push )
#pragma warning( disable : 4201 ) /* Prevents warning about nameless struct/union in /W4 /Za builds */
#endif
union {
#endif
cl_mem buffer;
#ifdef CL_VERSION_2_0
cl_mem mem_object;
};
#ifdef _MSC_VER
#pragma warning( pop )
#endif
#endif
} cl_image_desc;
#endif
#ifdef CL_VERSION_1_1
/* Sub-buffer region (origin/size within a parent buffer); units are bytes
* per the OpenCL clCreateSubBuffer specification. */
typedef struct _cl_buffer_region {
size_t origin;
size_t size;
} cl_buffer_region;
#endif
#ifdef CL_VERSION_3_0
#define CL_NAME_VERSION_MAX_NAME_SIZE 64
/* OpenCL 3.0 name/version pair, fixed-size name buffer. */
typedef struct _cl_name_version {
cl_version version;
char name[CL_NAME_VERSION_MAX_NAME_SIZE];
} cl_name_version;
#endif
/******************************************************************************/
/* Error Codes */
#define CL_SUCCESS 0
#define CL_DEVICE_NOT_FOUND -1
#define CL_DEVICE_NOT_AVAILABLE -2
#define CL_COMPILER_NOT_AVAILABLE -3
#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4
#define CL_OUT_OF_RESOURCES -5
#define CL_OUT_OF_HOST_MEMORY -6
#define CL_PROFILING_INFO_NOT_AVAILABLE -7
#define CL_MEM_COPY_OVERLAP -8
#define CL_IMAGE_FORMAT_MISMATCH -9
#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10
#define CL_BUILD_PROGRAM_FAILURE -11
#define CL_MAP_FAILURE -12
#ifdef CL_VERSION_1_1
#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13
#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
#endif
#ifdef CL_VERSION_1_2
#define CL_COMPILE_PROGRAM_FAILURE -15
#define CL_LINKER_NOT_AVAILABLE -16
#define CL_LINK_PROGRAM_FAILURE -17
#define CL_DEVICE_PARTITION_FAILED -18
#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19
#endif
#define CL_INVALID_VALUE -30
#define CL_INVALID_DEVICE_TYPE -31
#define CL_INVALID_PLATFORM -32
#define CL_INVALID_DEVICE -33
#define CL_INVALID_CONTEXT -34
#define CL_INVALID_QUEUE_PROPERTIES -35
#define CL_INVALID_COMMAND_QUEUE -36
#define CL_INVALID_HOST_PTR -37
#define CL_INVALID_MEM_OBJECT -38
#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39
#define CL_INVALID_IMAGE_SIZE -40
#define CL_INVALID_SAMPLER -41
#define CL_INVALID_BINARY -42
#define CL_INVALID_BUILD_OPTIONS -43
#define CL_INVALID_PROGRAM -44
#define CL_INVALID_PROGRAM_EXECUTABLE -45
#define CL_INVALID_KERNEL_NAME -46
#define CL_INVALID_KERNEL_DEFINITION -47
#define CL_INVALID_KERNEL -48
#define CL_INVALID_ARG_INDEX -49
#define CL_INVALID_ARG_VALUE -50
#define CL_INVALID_ARG_SIZE -51
#define CL_INVALID_KERNEL_ARGS -52
#define CL_INVALID_WORK_DIMENSION -53
#define CL_INVALID_WORK_GROUP_SIZE -54
#define CL_INVALID_WORK_ITEM_SIZE -55
#define CL_INVALID_GLOBAL_OFFSET -56
#define CL_INVALID_EVENT_WAIT_LIST -57
#define CL_INVALID_EVENT -58
#define CL_INVALID_OPERATION -59
#define CL_INVALID_GL_OBJECT -60
#define CL_INVALID_BUFFER_SIZE -61
#define CL_INVALID_MIP_LEVEL -62
#define CL_INVALID_GLOBAL_WORK_SIZE -63
#ifdef CL_VERSION_1_1
#define CL_INVALID_PROPERTY -64
#endif
#ifdef CL_VERSION_1_2
#define CL_INVALID_IMAGE_DESCRIPTOR -65
#define CL_INVALID_COMPILER_OPTIONS -66
#define CL_INVALID_LINKER_OPTIONS -67
#define CL_INVALID_DEVICE_PARTITION_COUNT -68
#endif
#ifdef CL_VERSION_2_0
#define CL_INVALID_PIPE_SIZE -69
#define CL_INVALID_DEVICE_QUEUE -70
#endif
#ifdef CL_VERSION_2_2
#define CL_INVALID_SPEC_ID -71
#define CL_MAX_SIZE_RESTRICTION_EXCEEDED -72
#endif
/* cl_bool */
#define CL_FALSE 0
#define CL_TRUE 1
#ifdef CL_VERSION_1_2
#define CL_BLOCKING CL_TRUE
#define CL_NON_BLOCKING CL_FALSE
#endif
/* cl_platform_info */
#define CL_PLATFORM_PROFILE 0x0900
#define CL_PLATFORM_VERSION 0x0901
#define CL_PLATFORM_NAME 0x0902
#define CL_PLATFORM_VENDOR 0x0903
#define CL_PLATFORM_EXTENSIONS 0x0904
#ifdef CL_VERSION_2_1
#define CL_PLATFORM_HOST_TIMER_RESOLUTION 0x0905
#endif
#ifdef CL_VERSION_3_0
#define CL_PLATFORM_NUMERIC_VERSION 0x0906
#define CL_PLATFORM_EXTENSIONS_WITH_VERSION 0x0907
#endif
/* cl_device_type - bitfield */
#define CL_DEVICE_TYPE_DEFAULT (1 << 0)
#define CL_DEVICE_TYPE_CPU (1 << 1)
#define CL_DEVICE_TYPE_GPU (1 << 2)
#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3)
#ifdef CL_VERSION_1_2
#define CL_DEVICE_TYPE_CUSTOM (1 << 4)
#endif
#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF
/* cl_device_info */
#define CL_DEVICE_TYPE 0x1000
#define CL_DEVICE_VENDOR_ID 0x1001
#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002
#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003
#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004
#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B
#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C
#define CL_DEVICE_ADDRESS_BITS 0x100D
#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E
#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F
#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010
#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011
#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012
#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013
#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014
#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015
#define CL_DEVICE_IMAGE_SUPPORT 0x1016
#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017
#define CL_DEVICE_MAX_SAMPLERS 0x1018
#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019
#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A
#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B
#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C
#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D
#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E
#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F
#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020
#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021
#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022
#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023
#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024
#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025
#define CL_DEVICE_ENDIAN_LITTLE 0x1026
#define CL_DEVICE_AVAILABLE 0x1027
#define CL_DEVICE_COMPILER_AVAILABLE 0x1028
#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029
#define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */
#ifdef CL_VERSION_2_0
#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A
#endif
#define CL_DEVICE_NAME 0x102B
#define CL_DEVICE_VENDOR 0x102C
#define CL_DRIVER_VERSION 0x102D
#define CL_DEVICE_PROFILE 0x102E
#define CL_DEVICE_VERSION 0x102F
#define CL_DEVICE_EXTENSIONS 0x1030
#define CL_DEVICE_PLATFORM 0x1031
#ifdef CL_VERSION_1_2
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
#endif
/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG which is already defined in "cl_ext.h" */
#ifdef CL_VERSION_1_1
#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034
#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B
#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
#endif
#ifdef CL_VERSION_1_2
#define CL_DEVICE_LINKER_AVAILABLE 0x103E
#define CL_DEVICE_BUILT_IN_KERNELS 0x103F
#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040
#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041
#define CL_DEVICE_PARENT_DEVICE 0x1042
#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043
#define CL_DEVICE_PARTITION_PROPERTIES 0x1044
#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045
#define CL_DEVICE_PARTITION_TYPE 0x1046
#define CL_DEVICE_REFERENCE_COUNT 0x1047
#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048
#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049
#endif
#ifdef CL_VERSION_2_0
#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A
#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B
#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C
#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D
#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E
#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F
#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050
#define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051
#define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052
#define CL_DEVICE_SVM_CAPABILITIES 0x1053
#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054
#define CL_DEVICE_MAX_PIPE_ARGS 0x1055
#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056
#define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057
#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058
#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059
#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A
#endif
#ifdef CL_VERSION_2_1
#define CL_DEVICE_IL_VERSION 0x105B
#define CL_DEVICE_MAX_NUM_SUB_GROUPS 0x105C
#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D
#endif
#ifdef CL_VERSION_3_0
#define CL_DEVICE_NUMERIC_VERSION 0x105E
#define CL_DEVICE_EXTENSIONS_WITH_VERSION 0x1060
#define CL_DEVICE_ILS_WITH_VERSION 0x1061
#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION 0x1062
#define CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES 0x1063
#define CL_DEVICE_ATOMIC_FENCE_CAPABILITIES 0x1064
#define CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT 0x1065
#define CL_DEVICE_OPENCL_C_ALL_VERSIONS 0x1066
#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x1067
#define CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT 0x1068
#define CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT 0x1069
/* 0x106A to 0x106E - Reserved for upcoming KHR extension */
#define CL_DEVICE_OPENCL_C_FEATURES 0x106F
#define CL_DEVICE_DEVICE_ENQUEUE_SUPPORT 0x1070
#define CL_DEVICE_PIPE_SUPPORT 0x1071
#endif
/* cl_device_fp_config - bitfield */
#define CL_FP_DENORM (1 << 0)
#define CL_FP_INF_NAN (1 << 1)
#define CL_FP_ROUND_TO_NEAREST (1 << 2)
#define CL_FP_ROUND_TO_ZERO (1 << 3)
#define CL_FP_ROUND_TO_INF (1 << 4)
#define CL_FP_FMA (1 << 5)
#ifdef CL_VERSION_1_1
#define CL_FP_SOFT_FLOAT (1 << 6)
#endif
#ifdef CL_VERSION_1_2
#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7)
#endif
/* cl_device_mem_cache_type */
#define CL_NONE 0x0
#define CL_READ_ONLY_CACHE 0x1
#define CL_READ_WRITE_CACHE 0x2
/* cl_device_local_mem_type */
#define CL_LOCAL 0x1
#define CL_GLOBAL 0x2
/* cl_device_exec_capabilities - bitfield */
#define CL_EXEC_KERNEL (1 << 0)
#define CL_EXEC_NATIVE_KERNEL (1 << 1)
/* cl_command_queue_properties - bitfield */
#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0)
#define CL_QUEUE_PROFILING_ENABLE (1 << 1)
#ifdef CL_VERSION_2_0
#define CL_QUEUE_ON_DEVICE (1 << 2)
#define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3)
#endif
/* cl_context_info */
#define CL_CONTEXT_REFERENCE_COUNT 0x1080
#define CL_CONTEXT_DEVICES 0x1081
#define CL_CONTEXT_PROPERTIES 0x1082
#ifdef CL_VERSION_1_1
#define CL_CONTEXT_NUM_DEVICES 0x1083
#endif
/* cl_context_properties */
#define CL_CONTEXT_PLATFORM 0x1084
#ifdef CL_VERSION_1_2
#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085
#endif
#ifdef CL_VERSION_1_2
/* cl_device_partition_property */
#define CL_DEVICE_PARTITION_EQUALLY 0x1086
#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087
#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088
#endif
#ifdef CL_VERSION_1_2
/* cl_device_affinity_domain */
#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0)
#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1)
#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2)
#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3)
#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4)
#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
#endif
#ifdef CL_VERSION_2_0
/* cl_device_svm_capabilities */
#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0)
#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER (1 << 1)
#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (1 << 2)
#define CL_DEVICE_SVM_ATOMICS (1 << 3)
#endif
/* cl_command_queue_info */
#define CL_QUEUE_CONTEXT 0x1090
#define CL_QUEUE_DEVICE 0x1091
#define CL_QUEUE_REFERENCE_COUNT 0x1092
#define CL_QUEUE_PROPERTIES 0x1093
#ifdef CL_VERSION_2_0
#define CL_QUEUE_SIZE 0x1094
#endif
#ifdef CL_VERSION_2_1
#define CL_QUEUE_DEVICE_DEFAULT 0x1095
#endif
#ifdef CL_VERSION_3_0
#define CL_QUEUE_PROPERTIES_ARRAY 0x1098
#endif
/* cl_mem_flags and cl_svm_mem_flags - bitfield */
#define CL_MEM_READ_WRITE (1 << 0)
#define CL_MEM_WRITE_ONLY (1 << 1)
#define CL_MEM_READ_ONLY (1 << 2)
#define CL_MEM_USE_HOST_PTR (1 << 3)
#define CL_MEM_ALLOC_HOST_PTR (1 << 4)
#define CL_MEM_COPY_HOST_PTR (1 << 5)
/* reserved (1 << 6) */
#ifdef CL_VERSION_1_2
#define CL_MEM_HOST_WRITE_ONLY (1 << 7)
#define CL_MEM_HOST_READ_ONLY (1 << 8)
#define CL_MEM_HOST_NO_ACCESS (1 << 9)
#endif
#ifdef CL_VERSION_2_0
#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */
#define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */
#define CL_MEM_KERNEL_READ_AND_WRITE (1 << 12)
#endif
#ifdef CL_VERSION_1_2
/* cl_mem_migration_flags - bitfield */
#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0)
#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1)
#endif
/* cl_channel_order */
#define CL_R 0x10B0
#define CL_A 0x10B1
#define CL_RG 0x10B2
#define CL_RA 0x10B3
#define CL_RGB 0x10B4
#define CL_RGBA 0x10B5
#define CL_BGRA 0x10B6
#define CL_ARGB 0x10B7
#define CL_INTENSITY 0x10B8
#define CL_LUMINANCE 0x10B9
#ifdef CL_VERSION_1_1
#define CL_Rx 0x10BA
#define CL_RGx 0x10BB
#define CL_RGBx 0x10BC
#endif
#ifdef CL_VERSION_1_2
#define CL_DEPTH 0x10BD
#define CL_DEPTH_STENCIL 0x10BE
#endif
#ifdef CL_VERSION_2_0
#define CL_sRGB 0x10BF
#define CL_sRGBx 0x10C0
#define CL_sRGBA 0x10C1
#define CL_sBGRA 0x10C2
#define CL_ABGR 0x10C3
#endif
/* cl_channel_type */
#define CL_SNORM_INT8 0x10D0
#define CL_SNORM_INT16 0x10D1
#define CL_UNORM_INT8 0x10D2
#define CL_UNORM_INT16 0x10D3
#define CL_UNORM_SHORT_565 0x10D4
#define CL_UNORM_SHORT_555 0x10D5
#define CL_UNORM_INT_101010 0x10D6
#define CL_SIGNED_INT8 0x10D7
#define CL_SIGNED_INT16 0x10D8
#define CL_SIGNED_INT32 0x10D9
#define CL_UNSIGNED_INT8 0x10DA
#define CL_UNSIGNED_INT16 0x10DB
#define CL_UNSIGNED_INT32 0x10DC
#define CL_HALF_FLOAT 0x10DD
#define CL_FLOAT 0x10DE
#ifdef CL_VERSION_1_2
#define CL_UNORM_INT24 0x10DF
#endif
#ifdef CL_VERSION_2_1
#define CL_UNORM_INT_101010_2 0x10E0
#endif
/* cl_mem_object_type */
#define CL_MEM_OBJECT_BUFFER 0x10F0
#define CL_MEM_OBJECT_IMAGE2D 0x10F1
#define CL_MEM_OBJECT_IMAGE3D 0x10F2
#ifdef CL_VERSION_1_2
#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3
#define CL_MEM_OBJECT_IMAGE1D 0x10F4
#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5
#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6
#endif
#ifdef CL_VERSION_2_0
#define CL_MEM_OBJECT_PIPE 0x10F7
#endif
/* cl_mem_info */
#define CL_MEM_TYPE 0x1100
#define CL_MEM_FLAGS 0x1101
#define CL_MEM_SIZE 0x1102
#define CL_MEM_HOST_PTR 0x1103
#define CL_MEM_MAP_COUNT 0x1104
#define CL_MEM_REFERENCE_COUNT 0x1105
#define CL_MEM_CONTEXT 0x1106
#ifdef CL_VERSION_1_1
#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
#define CL_MEM_OFFSET 0x1108
#endif
#ifdef CL_VERSION_2_0
#define CL_MEM_USES_SVM_POINTER 0x1109
#endif
#ifdef CL_VERSION_3_0
#define CL_MEM_PROPERTIES 0x110A
#endif
/* cl_image_info */
#define CL_IMAGE_FORMAT 0x1110
#define CL_IMAGE_ELEMENT_SIZE 0x1111
#define CL_IMAGE_ROW_PITCH 0x1112
#define CL_IMAGE_SLICE_PITCH 0x1113
#define CL_IMAGE_WIDTH 0x1114
#define CL_IMAGE_HEIGHT 0x1115
#define CL_IMAGE_DEPTH 0x1116
#ifdef CL_VERSION_1_2
#define CL_IMAGE_ARRAY_SIZE 0x1117
#define CL_IMAGE_BUFFER 0x1118
#define CL_IMAGE_NUM_MIP_LEVELS 0x1119
#define CL_IMAGE_NUM_SAMPLES 0x111A
#endif
/* cl_pipe_info */
#ifdef CL_VERSION_2_0
#define CL_PIPE_PACKET_SIZE 0x1120
#define CL_PIPE_MAX_PACKETS 0x1121
#endif
#ifdef CL_VERSION_3_0
#define CL_PIPE_PROPERTIES 0x1122
#endif
/* cl_addressing_mode */
#define CL_ADDRESS_NONE 0x1130
#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
#define CL_ADDRESS_CLAMP 0x1132
#define CL_ADDRESS_REPEAT 0x1133
#ifdef CL_VERSION_1_1
#define CL_ADDRESS_MIRRORED_REPEAT 0x1134
#endif
/* cl_filter_mode */
#define CL_FILTER_NEAREST 0x1140
#define CL_FILTER_LINEAR 0x1141
/* cl_sampler_info */
#define CL_SAMPLER_REFERENCE_COUNT 0x1150
#define CL_SAMPLER_CONTEXT 0x1151
#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
#define CL_SAMPLER_ADDRESSING_MODE 0x1153
#define CL_SAMPLER_FILTER_MODE 0x1154
#ifdef CL_VERSION_2_0
/* These enumerants are for the cl_khr_mipmap_image extension.
They have since been added to cl_ext.h with an appropriate
KHR suffix, but are left here for backwards compatibility. */
#define CL_SAMPLER_MIP_FILTER_MODE 0x1155
#define CL_SAMPLER_LOD_MIN 0x1156
#define CL_SAMPLER_LOD_MAX 0x1157
#endif
#ifdef CL_VERSION_3_0
#define CL_SAMPLER_PROPERTIES 0x1158
#endif
/* cl_map_flags - bitfield */
#define CL_MAP_READ (1 << 0)
#define CL_MAP_WRITE (1 << 1)
#ifdef CL_VERSION_1_2
#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
#endif
/* cl_program_info */
#define CL_PROGRAM_REFERENCE_COUNT 0x1160
#define CL_PROGRAM_CONTEXT 0x1161
#define CL_PROGRAM_NUM_DEVICES 0x1162
#define CL_PROGRAM_DEVICES 0x1163
#define CL_PROGRAM_SOURCE 0x1164
#define CL_PROGRAM_BINARY_SIZES 0x1165
#define CL_PROGRAM_BINARIES 0x1166
#ifdef CL_VERSION_1_2
#define CL_PROGRAM_NUM_KERNELS 0x1167
#define CL_PROGRAM_KERNEL_NAMES 0x1168
#endif
#ifdef CL_VERSION_2_1
#define CL_PROGRAM_IL 0x1169
#endif
#ifdef CL_VERSION_2_2
#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT 0x116A
#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT 0x116B
#endif
/* cl_program_build_info */
#define CL_PROGRAM_BUILD_STATUS 0x1181
#define CL_PROGRAM_BUILD_OPTIONS 0x1182
#define CL_PROGRAM_BUILD_LOG 0x1183
#ifdef CL_VERSION_1_2
#define CL_PROGRAM_BINARY_TYPE 0x1184
#endif
#ifdef CL_VERSION_2_0
#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185
#endif
#ifdef CL_VERSION_1_2
/* cl_program_binary_type */
#define CL_PROGRAM_BINARY_TYPE_NONE 0x0
#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1
#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2
#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4
#endif
/* cl_build_status */
#define CL_BUILD_SUCCESS 0
#define CL_BUILD_NONE -1
#define CL_BUILD_ERROR -2
#define CL_BUILD_IN_PROGRESS -3
/* cl_kernel_info */
#define CL_KERNEL_FUNCTION_NAME 0x1190
#define CL_KERNEL_NUM_ARGS 0x1191
#define CL_KERNEL_REFERENCE_COUNT 0x1192
#define CL_KERNEL_CONTEXT 0x1193
#define CL_KERNEL_PROGRAM 0x1194
#ifdef CL_VERSION_1_2
#define CL_KERNEL_ATTRIBUTES 0x1195
#endif
#ifdef CL_VERSION_1_2
/* cl_kernel_arg_info */
#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196
#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197
#define CL_KERNEL_ARG_TYPE_NAME 0x1198
#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199
#define CL_KERNEL_ARG_NAME 0x119A
#endif
#ifdef CL_VERSION_1_2
/* cl_kernel_arg_address_qualifier */
#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B
#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C
#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D
#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E
#endif
#ifdef CL_VERSION_1_2
/* cl_kernel_arg_access_qualifier */
#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0
#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1
#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2
#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3
#endif
#ifdef CL_VERSION_1_2
/* cl_kernel_arg_type_qualifier */
#define CL_KERNEL_ARG_TYPE_NONE 0
#define CL_KERNEL_ARG_TYPE_CONST (1 << 0)
#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1)
#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2)
#ifdef CL_VERSION_2_0
#define CL_KERNEL_ARG_TYPE_PIPE (1 << 3)
#endif
#endif
/* cl_kernel_work_group_info */
#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
#ifdef CL_VERSION_1_2
#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5
#endif
#ifdef CL_VERSION_2_1
/* cl_kernel_sub_group_info */
/* NOTE(review): the first two values intentionally fall outside the 0x11Bx
   range used by the rest of this group — they were inherited unchanged from
   the cl_khr_subgroups extension when sub-groups were promoted to core in
   OpenCL 2.1. Do not "fix" them to be sequential. */
#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE 0x2033
#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE 0x2034
#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT 0x11B8
#define CL_KERNEL_MAX_NUM_SUB_GROUPS 0x11B9
#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS 0x11BA
#endif
#ifdef CL_VERSION_2_0
/* cl_kernel_exec_info */
#define CL_KERNEL_EXEC_INFO_SVM_PTRS 0x11B6
#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM 0x11B7
#endif
/* cl_event_info */
#define CL_EVENT_COMMAND_QUEUE 0x11D0
#define CL_EVENT_COMMAND_TYPE 0x11D1
#define CL_EVENT_REFERENCE_COUNT 0x11D2
#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
#ifdef CL_VERSION_1_1
#define CL_EVENT_CONTEXT 0x11D4
#endif
/* cl_command_type */
#define CL_COMMAND_NDRANGE_KERNEL 0x11F0
#define CL_COMMAND_TASK 0x11F1
#define CL_COMMAND_NATIVE_KERNEL 0x11F2
#define CL_COMMAND_READ_BUFFER 0x11F3
#define CL_COMMAND_WRITE_BUFFER 0x11F4
#define CL_COMMAND_COPY_BUFFER 0x11F5
#define CL_COMMAND_READ_IMAGE 0x11F6
#define CL_COMMAND_WRITE_IMAGE 0x11F7
#define CL_COMMAND_COPY_IMAGE 0x11F8
#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9
#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA
#define CL_COMMAND_MAP_BUFFER 0x11FB
#define CL_COMMAND_MAP_IMAGE 0x11FC
#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD
#define CL_COMMAND_MARKER 0x11FE
#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF
#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200
#ifdef CL_VERSION_1_1
#define CL_COMMAND_READ_BUFFER_RECT 0x1201
#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202
#define CL_COMMAND_COPY_BUFFER_RECT 0x1203
#define CL_COMMAND_USER 0x1204
#endif
#ifdef CL_VERSION_1_2
#define CL_COMMAND_BARRIER 0x1205
#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206
#define CL_COMMAND_FILL_BUFFER 0x1207
#define CL_COMMAND_FILL_IMAGE 0x1208
#endif
#ifdef CL_VERSION_2_0
#define CL_COMMAND_SVM_FREE 0x1209
#define CL_COMMAND_SVM_MEMCPY 0x120A
#define CL_COMMAND_SVM_MEMFILL 0x120B
#define CL_COMMAND_SVM_MAP 0x120C
#define CL_COMMAND_SVM_UNMAP 0x120D
#endif
#ifdef CL_VERSION_3_0
#define CL_COMMAND_SVM_MIGRATE_MEM 0x120E
#endif
/* command execution status */
#define CL_COMPLETE 0x0
#define CL_RUNNING 0x1
#define CL_SUBMITTED 0x2
#define CL_QUEUED 0x3
/* cl_buffer_create_type */
#ifdef CL_VERSION_1_1
#define CL_BUFFER_CREATE_TYPE_REGION 0x1220
#endif
/* cl_profiling_info */
#define CL_PROFILING_COMMAND_QUEUED 0x1280
#define CL_PROFILING_COMMAND_SUBMIT 0x1281
#define CL_PROFILING_COMMAND_START 0x1282
#define CL_PROFILING_COMMAND_END 0x1283
#ifdef CL_VERSION_2_0
#define CL_PROFILING_COMMAND_COMPLETE 0x1284
#endif
/* cl_device_atomic_capabilities - bitfield */
#ifdef CL_VERSION_3_0
#define CL_DEVICE_ATOMIC_ORDER_RELAXED (1 << 0)
#define CL_DEVICE_ATOMIC_ORDER_ACQ_REL (1 << 1)
#define CL_DEVICE_ATOMIC_ORDER_SEQ_CST (1 << 2)
#define CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM (1 << 3)
#define CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP (1 << 4)
#define CL_DEVICE_ATOMIC_SCOPE_DEVICE (1 << 5)
#define CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES (1 << 6)
#endif
/* cl_khronos_vendor_id */
#define CL_KHRONOS_VENDOR_ID_CODEPLAY 0x10004
#ifdef CL_VERSION_3_0
/* cl_version */
/* A cl_version packs major.minor.patch into one 32-bit value with a
   10/10/12 bit split: bits 31..22 = major, 21..12 = minor, 11..0 = patch. */
#define CL_VERSION_MAJOR_BITS (10)
#define CL_VERSION_MINOR_BITS (10)
#define CL_VERSION_PATCH_BITS (12)
/* Low-order masks for each field: ((1 << bits) - 1). */
#define CL_VERSION_MAJOR_MASK ((1 << CL_VERSION_MAJOR_BITS) - 1)
#define CL_VERSION_MINOR_MASK ((1 << CL_VERSION_MINOR_BITS) - 1)
#define CL_VERSION_PATCH_MASK ((1 << CL_VERSION_PATCH_BITS) - 1)
/* Extract major: shift past the minor and patch bits (no mask needed,
   assuming the value was packed with CL_MAKE_VERSION so the top field is
   already only 10 bits wide). */
#define CL_VERSION_MAJOR(version) \
((version) >> (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS))
/* Extract minor: shift past the patch bits, then mask to 10 bits. */
#define CL_VERSION_MINOR(version) \
(((version) >> CL_VERSION_PATCH_BITS) & CL_VERSION_MINOR_MASK)
/* Extract patch: mask the low 12 bits. */
#define CL_VERSION_PATCH(version) ((version) & CL_VERSION_PATCH_MASK)
/* Compose a packed cl_version. Each field is masked before shifting, so
   out-of-range inputs are silently truncated rather than corrupting the
   neighboring fields. Arguments are expanded more than once — avoid
   side-effecting expressions. */
#define CL_MAKE_VERSION(major, minor, patch) \
((((major) & CL_VERSION_MAJOR_MASK) \
<< (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS)) | \
(((minor) & CL_VERSION_MINOR_MASK) << CL_VERSION_PATCH_BITS) | \
((patch) & CL_VERSION_PATCH_MASK))
#endif
/********************************************************************************************************/
/* Platform API */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformIDs(cl_uint num_entries,
cl_platform_id * platforms,
cl_uint * num_platforms) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformInfo(cl_platform_id platform,
cl_platform_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Device APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDs(cl_platform_id platform,
cl_device_type device_type,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceInfo(cl_device_id device,
cl_device_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateSubDevices(cl_device_id in_device,
const cl_device_partition_property * properties,
cl_uint num_devices,
cl_device_id * out_devices,
cl_uint * num_devices_ret) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2;
#endif
#ifdef CL_VERSION_2_1
extern CL_API_ENTRY cl_int CL_API_CALL
clSetDefaultDeviceCommandQueue(cl_context context,
cl_device_id device,
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceAndHostTimer(cl_device_id device,
cl_ulong* device_timestamp,
cl_ulong* host_timestamp) CL_API_SUFFIX__VERSION_2_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetHostTimer(cl_device_id device,
cl_ulong * host_timestamp) CL_API_SUFFIX__VERSION_2_1;
#endif
/* Context APIs */
extern CL_API_ENTRY cl_context CL_API_CALL
clCreateContext(const cl_context_properties * properties,
cl_uint num_devices,
const cl_device_id * devices,
void (CL_CALLBACK * pfn_notify)(const char * errinfo,
const void * private_info,
size_t cb,
void * user_data),
void * user_data,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_context CL_API_CALL
clCreateContextFromType(const cl_context_properties * properties,
cl_device_type device_type,
void (CL_CALLBACK * pfn_notify)(const char * errinfo,
const void * private_info,
size_t cb,
void * user_data),
void * user_data,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetContextInfo(cl_context context,
cl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Command Queue APIs */
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY cl_command_queue CL_API_CALL
clCreateCommandQueueWithProperties(cl_context context,
cl_device_id device,
const cl_queue_properties * properties,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetCommandQueueInfo(cl_command_queue command_queue,
cl_command_queue_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Memory Object APIs */
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateBuffer(cl_context context,
cl_mem_flags flags,
size_t size,
void * host_ptr,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateSubBuffer(cl_mem buffer,
cl_mem_flags flags,
cl_buffer_create_type buffer_create_type,
const void * buffer_create_info,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1;
#endif
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateImage(cl_context context,
cl_mem_flags flags,
const cl_image_format * image_format,
const cl_image_desc * image_desc,
void * host_ptr,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#endif
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreatePipe(cl_context context,
cl_mem_flags flags,
cl_uint pipe_packet_size,
cl_uint pipe_max_packets,
const cl_pipe_properties * properties,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0;
#endif
#ifdef CL_VERSION_3_0
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateBufferWithProperties(cl_context context,
const cl_mem_properties * properties,
cl_mem_flags flags,
size_t size,
void * host_ptr,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_3_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateImageWithProperties(cl_context context,
const cl_mem_properties * properties,
cl_mem_flags flags,
const cl_image_format * image_format,
const cl_image_desc * image_desc,
void * host_ptr,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_3_0;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSupportedImageFormats(cl_context context,
cl_mem_flags flags,
cl_mem_object_type image_type,
cl_uint num_entries,
cl_image_format * image_formats,
cl_uint * num_image_formats) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetMemObjectInfo(cl_mem memobj,
cl_mem_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetImageInfo(cl_mem image,
cl_image_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPipeInfo(cl_mem pipe,
cl_pipe_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_2_0;
#endif
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_int CL_API_CALL
clSetMemObjectDestructorCallback(cl_mem memobj,
void (CL_CALLBACK * pfn_notify)(cl_mem memobj,
void * user_data),
void * user_data) CL_API_SUFFIX__VERSION_1_1;
#endif
/* SVM Allocation APIs */
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY void * CL_API_CALL
clSVMAlloc(cl_context context,
cl_svm_mem_flags flags,
size_t size,
cl_uint alignment) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY void CL_API_CALL
clSVMFree(cl_context context,
void * svm_pointer) CL_API_SUFFIX__VERSION_2_0;
#endif
/* Sampler APIs */
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY cl_sampler CL_API_CALL
clCreateSamplerWithProperties(cl_context context,
const cl_sampler_properties * sampler_properties,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetSamplerInfo(cl_sampler sampler,
cl_sampler_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Program Object APIs */
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithSource(cl_context context,
cl_uint count,
const char ** strings,
const size_t * lengths,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithBinary(cl_context context,
cl_uint num_devices,
const cl_device_id * device_list,
const size_t * lengths,
const unsigned char ** binaries,
cl_int * binary_status,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithBuiltInKernels(cl_context context,
cl_uint num_devices,
const cl_device_id * device_list,
const char * kernel_names,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#endif
#ifdef CL_VERSION_2_1
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithIL(cl_context context,
const void* il,
size_t length,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clBuildProgram(cl_program program,
cl_uint num_devices,
const cl_device_id * device_list,
const char * options,
void (CL_CALLBACK * pfn_notify)(cl_program program,
void * user_data),
void * user_data) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clCompileProgram(cl_program program,
cl_uint num_devices,
const cl_device_id * device_list,
const char * options,
cl_uint num_input_headers,
const cl_program * input_headers,
const char ** header_include_names,
void (CL_CALLBACK * pfn_notify)(cl_program program,
void * user_data),
void * user_data) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_program CL_API_CALL
clLinkProgram(cl_context context,
cl_uint num_devices,
const cl_device_id * device_list,
const char * options,
cl_uint num_input_programs,
const cl_program * input_programs,
void (CL_CALLBACK * pfn_notify)(cl_program program,
void * user_data),
void * user_data,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#endif
#ifdef CL_VERSION_2_2
extern CL_API_ENTRY cl_int CL_API_CALL
clSetProgramReleaseCallback(cl_program program,
void (CL_CALLBACK * pfn_notify)(cl_program program,
void * user_data),
void * user_data) CL_API_SUFFIX__VERSION_2_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetProgramSpecializationConstant(cl_program program,
cl_uint spec_id,
size_t spec_size,
const void* spec_value) CL_API_SUFFIX__VERSION_2_2;
#endif
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clUnloadPlatformCompiler(cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clGetProgramInfo(cl_program program,
cl_program_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetProgramBuildInfo(cl_program program,
cl_device_id device,
cl_program_build_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Kernel Object APIs */
extern CL_API_ENTRY cl_kernel CL_API_CALL
clCreateKernel(cl_program program,
const char * kernel_name,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateKernelsInProgram(cl_program program,
cl_uint num_kernels,
cl_kernel * kernels,
cl_uint * num_kernels_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_1
extern CL_API_ENTRY cl_kernel CL_API_CALL
clCloneKernel(cl_kernel source_kernel,
cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArg(cl_kernel kernel,
cl_uint arg_index,
size_t arg_size,
const void * arg_value) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArgSVMPointer(cl_kernel kernel,
cl_uint arg_index,
const void * arg_value) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelExecInfo(cl_kernel kernel,
cl_kernel_exec_info param_name,
size_t param_value_size,
const void * param_value) CL_API_SUFFIX__VERSION_2_0;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelInfo(cl_kernel kernel,
cl_kernel_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelArgInfo(cl_kernel kernel,
cl_uint arg_indx,
cl_kernel_arg_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelWorkGroupInfo(cl_kernel kernel,
cl_device_id device,
cl_kernel_work_group_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_1
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelSubGroupInfo(cl_kernel kernel,
cl_device_id device,
cl_kernel_sub_group_info param_name,
size_t input_value_size,
const void* input_value,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_2_1;
#endif
/* Event Object APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clWaitForEvents(cl_uint num_events,
const cl_event * event_list) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventInfo(cl_event event,
cl_event_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateUserEvent(cl_context context,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_int CL_API_CALL
clSetUserEventStatus(cl_event event,
cl_int execution_status) CL_API_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetEventCallback(cl_event event,
cl_int command_exec_callback_type,
void (CL_CALLBACK * pfn_notify)(cl_event event,
cl_int event_command_status,
void * user_data),
void * user_data) CL_API_SUFFIX__VERSION_1_1;
#endif
/* Profiling APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetEventProfilingInfo(cl_event event,
cl_profiling_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Flush and Finish APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
/* Enqueued Commands APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadBuffer(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_read,
size_t offset,
size_t size,
void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadBufferRect(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_read,
const size_t * buffer_offset,
const size_t * host_offset,
const size_t * region,
size_t buffer_row_pitch,
size_t buffer_slice_pitch,
size_t host_row_pitch,
size_t host_slice_pitch,
void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_1;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteBuffer(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_write,
size_t offset,
size_t size,
const void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteBufferRect(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_write,
const size_t * buffer_offset,
const size_t * host_offset,
const size_t * region,
size_t buffer_row_pitch,
size_t buffer_slice_pitch,
size_t host_row_pitch,
size_t host_slice_pitch,
const void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_1;
#endif
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueFillBuffer(cl_command_queue command_queue,
cl_mem buffer,
const void * pattern,
size_t pattern_size,
size_t offset,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBuffer(cl_command_queue command_queue,
cl_mem src_buffer,
cl_mem dst_buffer,
size_t src_offset,
size_t dst_offset,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBufferRect(cl_command_queue command_queue,
cl_mem src_buffer,
cl_mem dst_buffer,
const size_t * src_origin,
const size_t * dst_origin,
const size_t * region,
size_t src_row_pitch,
size_t src_slice_pitch,
size_t dst_row_pitch,
size_t dst_slice_pitch,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_1;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReadImage(cl_command_queue command_queue,
cl_mem image,
cl_bool blocking_read,
const size_t * origin,
const size_t * region,
size_t row_pitch,
size_t slice_pitch,
void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueWriteImage(cl_command_queue command_queue,
cl_mem image,
cl_bool blocking_write,
const size_t * origin,
const size_t * region,
size_t input_row_pitch,
size_t input_slice_pitch,
const void * ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueFillImage(cl_command_queue command_queue,
cl_mem image,
const void * fill_color,
const size_t * origin,
const size_t * region,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyImage(cl_command_queue command_queue,
cl_mem src_image,
cl_mem dst_image,
const size_t * src_origin,
const size_t * dst_origin,
const size_t * region,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
cl_mem src_image,
cl_mem dst_buffer,
const size_t * src_origin,
const size_t * region,
size_t dst_offset,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueCopyBufferToImage(cl_command_queue command_queue,
cl_mem src_buffer,
cl_mem dst_image,
size_t src_offset,
const size_t * dst_origin,
const size_t * region,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY void * CL_API_CALL
clEnqueueMapBuffer(cl_command_queue command_queue,
cl_mem buffer,
cl_bool blocking_map,
cl_map_flags map_flags,
size_t offset,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY void * CL_API_CALL
clEnqueueMapImage(cl_command_queue command_queue,
cl_mem image,
cl_bool blocking_map,
cl_map_flags map_flags,
const size_t * origin,
const size_t * region,
size_t * image_row_pitch,
size_t * image_slice_pitch,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueUnmapMemObject(cl_command_queue command_queue,
cl_mem memobj,
void * mapped_ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMigrateMemObjects(cl_command_queue command_queue,
cl_uint num_mem_objects,
const cl_mem * mem_objects,
cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueNDRangeKernel(cl_command_queue command_queue,
cl_kernel kernel,
cl_uint work_dim,
const size_t * global_work_offset,
const size_t * global_work_size,
const size_t * local_work_size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueNativeKernel(cl_command_queue command_queue,
void (CL_CALLBACK * user_func)(void *),
void * args,
size_t cb_args,
cl_uint num_mem_objects,
const cl_mem * mem_list,
const void ** args_mem_loc,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMarkerWithWaitList(cl_command_queue command_queue,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueBarrierWithWaitList(cl_command_queue command_queue,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#endif
#ifdef CL_VERSION_2_0
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMFree(cl_command_queue command_queue,
cl_uint num_svm_pointers,
void * svm_pointers[],
void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue,
cl_uint num_svm_pointers,
void * svm_pointers[],
void * user_data),
void * user_data,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemcpy(cl_command_queue command_queue,
cl_bool blocking_copy,
void * dst_ptr,
const void * src_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemFill(cl_command_queue command_queue,
void * svm_ptr,
const void * pattern,
size_t pattern_size,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMap(cl_command_queue command_queue,
cl_bool blocking_map,
cl_map_flags flags,
void * svm_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_2_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMUnmap(cl_command_queue command_queue,
void * svm_ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_2_0;
#endif
#ifdef CL_VERSION_2_1
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMigrateMem(cl_command_queue command_queue,
cl_uint num_svm_pointers,
const void ** svm_pointers,
const size_t * sizes,
cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_2_1;
#endif
#ifdef CL_VERSION_1_2
/* Extension function access
*
* Returns the extension function address for the given function name,
* or NULL if a valid function can not be found. The client must
* check to make sure the address is not NULL, before using or
* calling the returned function address.
*/
extern CL_API_ENTRY void * CL_API_CALL
clGetExtensionFunctionAddressForPlatform(cl_platform_id platform,
const char * func_name) CL_API_SUFFIX__VERSION_1_2;
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
/*
* WARNING:
* This API introduces mutable state into the OpenCL implementation. It has been REMOVED
* to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the
* OpenCL 1.1 conformance test, and consequently may not work or may not work dependably.
* It is likely to be non-performant. Use of this API is not advised. Use at your own risk.
*
* Software developers previously relying on this API are instructed to set the command queue
* properties when creating the queue, instead.
*/
extern CL_API_ENTRY cl_int CL_API_CALL
clSetCommandQueueProperty(cl_command_queue command_queue,
cl_command_queue_properties properties,
cl_bool enable,
cl_command_queue_properties * old_properties) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */
/* Deprecated OpenCL 1.1 APIs */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateImage2D(cl_context context,
cl_mem_flags flags,
const cl_image_format * image_format,
size_t image_width,
size_t image_height,
size_t image_row_pitch,
void * host_ptr,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateImage3D(cl_context context,
cl_mem_flags flags,
const cl_image_format * image_format,
size_t image_width,
size_t image_height,
size_t image_depth,
size_t image_row_pitch,
size_t image_slice_pitch,
void * host_ptr,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clEnqueueMarker(cl_command_queue command_queue,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clEnqueueWaitForEvents(cl_command_queue command_queue,
cl_uint num_events,
const cl_event * event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clEnqueueBarrier(cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
clGetExtensionFunctionAddress(const char * func_name) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
/* Deprecated OpenCL 2.0 APIs */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL
clCreateCommandQueue(cl_context context,
cl_device_id device,
cl_command_queue_properties properties,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL
clCreateSampler(cl_context context,
cl_bool normalized_coords,
cl_addressing_mode addressing_mode,
cl_filter_mode filter_mode,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL
clEnqueueTask(cl_command_queue command_queue,
cl_kernel kernel,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_d3d10.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_D3D10_H
#define __OPENCL_CL_D3D10_H
#include <d3d10.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d10_sharing */
#define cl_khr_d3d10_sharing 1
typedef cl_uint cl_d3d10_device_source_khr;
typedef cl_uint cl_d3d10_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D10_DEVICE_KHR -1002
#define CL_INVALID_D3D10_RESOURCE_KHR -1003
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
/* cl_d3d10_device_source_nv */
#define CL_D3D10_DEVICE_KHR 0x4010
#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
/* cl_d3d10_device_set_nv */
#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
/* cl_context_info */
#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
/* cl_mem_info */
#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
/* cl_image_info */
#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
cl_platform_id platform,
cl_d3d10_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D10_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_d3d11.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_D3D11_H
#define __OPENCL_CL_D3D11_H
#include <d3d11.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d11_sharing */
#define cl_khr_d3d11_sharing 1
typedef cl_uint cl_d3d11_device_source_khr;
typedef cl_uint cl_d3d11_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D11_DEVICE_KHR -1006
#define CL_INVALID_D3D11_RESOURCE_KHR -1007
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009
/* cl_d3d11_device_source */
#define CL_D3D11_DEVICE_KHR 0x4019
#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A
/* cl_d3d11_device_set */
#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B
#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C
/* cl_context_info */
#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D
#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
/* cl_mem_info */
#define CL_MEM_D3D11_RESOURCE_KHR 0x401E
/* cl_image_info */
#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
cl_platform_id platform,
cl_d3d11_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D11_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_dx9_media_sharing.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
#define __OPENCL_CL_DX9_MEDIA_SHARING_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************/
/* cl_khr_dx9_media_sharing */
#define cl_khr_dx9_media_sharing 1
typedef cl_uint cl_dx9_media_adapter_type_khr;
typedef cl_uint cl_dx9_media_adapter_set_khr;
#if defined(_WIN32)
#include <d3d9.h>
/* Describes a Direct3D 9 surface being shared with OpenCL under the
 * cl_khr_dx9_media_sharing extension. Presumably this is the structure
 * passed through the void* surface_info argument of
 * clCreateFromDX9MediaSurfaceKHR_fn below — confirm against the
 * cl_khr_dx9_media_sharing extension specification. Only defined on
 * Windows (_WIN32), since it references D3D9/HANDLE types. */
typedef struct _cl_dx9_surface_info_khr
{
IDirect3DSurface9 *resource; /* the D3D9 surface to wrap as a cl_mem */
HANDLE shared_handle; /* shared handle of the surface; NOTE(review): may be NULL for non-shared surfaces — verify against spec */
} cl_dx9_surface_info_khr;
#endif
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010
#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011
#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012
#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013
/* cl_media_adapter_type_khr */
#define CL_ADAPTER_D3D9_KHR 0x2020
#define CL_ADAPTER_D3D9EX_KHR 0x2021
#define CL_ADAPTER_DXVA_KHR 0x2022
/* cl_media_adapter_set_khr */
#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023
#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024
/* cl_context_info */
#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025
#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026
#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027
/* cl_mem_info */
#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028
#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029
/* cl_image_info */
#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B
#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
cl_platform_id platform,
cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr * media_adapter_type,
void * media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
cl_context context,
cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type,
void * surface_info,
cl_uint plane,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_dx9_media_sharing_intel.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2019 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_dx9_media_sharing_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#include <d3d9.h>
#include <dxvahd.h>
#include <wtypes.h>
#include <d3d9types.h>
#ifdef __cplusplus
extern "C" {
#endif
/***************************************
* cl_intel_dx9_media_sharing extension *
****************************************/
#define cl_intel_dx9_media_sharing 1
typedef cl_uint cl_dx9_device_source_intel;
typedef cl_uint cl_dx9_device_set_intel;
/* error codes */
#define CL_INVALID_DX9_DEVICE_INTEL -1010
#define CL_INVALID_DX9_RESOURCE_INTEL -1011
#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012
#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013
/* cl_dx9_device_source_intel */
#define CL_D3D9_DEVICE_INTEL 0x4022
#define CL_D3D9EX_DEVICE_INTEL 0x4070
#define CL_DXVA_DEVICE_INTEL 0x4071
/* cl_dx9_device_set_intel */
#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024
#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025
/* cl_context_info */
#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026
#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072
#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073
/* cl_mem_info */
#define CL_MEM_DX9_RESOURCE_INTEL 0x4027
#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074
/* cl_image_info */
#define CL_IMAGE_DX9_PLANE_INTEL 0x4075
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A
#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B
/******************************************************************************/
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromDX9INTEL(
cl_platform_id platform,
cl_dx9_device_source_intel dx9_device_source,
void* dx9_object,
cl_dx9_device_set_intel dx9_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
cl_platform_id platform,
cl_dx9_device_source_intel dx9_device_source,
void* dx9_object,
cl_dx9_device_set_intel dx9_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromDX9MediaSurfaceINTEL(
cl_context context,
cl_mem_flags flags,
IDirect3DSurface9* resource,
HANDLE sharedHandle,
UINT plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DSurface9* resource,
HANDLE sharedHandle,
UINT plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireDX9ObjectsINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseDX9ObjectsINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_egl.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_EGL_H
#define __OPENCL_CL_EGL_H
#include <CL/cl.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
/* Error type for clCreateFromEGLImageKHR */
#define CL_INVALID_EGL_OBJECT_KHR -1093
#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
/* CLeglImageKHR is an opaque handle to an EGLImage */
typedef void* CLeglImageKHR;
/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
typedef void* CLeglDisplayKHR;
/* CLeglSyncKHR is an opaque handle to an EGLSync object */
typedef void* CLeglSyncKHR;
/* properties passed to clCreateFromEGLImageKHR */
typedef intptr_t cl_egl_image_properties_khr;
#define cl_khr_egl_image 1
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromEGLImageKHR(cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
#define cl_khr_egl_event 1
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromEGLSyncKHR(cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int * errcode_ret);
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_EGL_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_ext.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/* cl_ext.h contains OpenCL extensions which don't have external */
/* (OpenGL, D3D) dependencies. */
#ifndef __CL_EXT_H
#define __CL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl.h>
/* cl_khr_fp64 extension - no extension #define since it has no functions */
/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */
#if CL_TARGET_OPENCL_VERSION <= 110
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
#endif
/* cl_khr_fp16 extension - no extension #define since it has no functions */
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
/* Memory object destruction
*
* Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
*
* Registers a user callback function that will be called when the memory object is deleted and its resources
* freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
* stack associated with memobj. The registered user callback functions are called in the reverse order in
* which they were registered. The user callback functions are called and then the memory object is deleted
* and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
* notified when the memory referenced by host_ptr, specified when the memory object is created and used as
* the storage bits for the memory object, can be reused or freed.
*
* The application may not call CL api's with the cl_mem object passed to the pfn_notify.
*
* Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*/
#define cl_APPLE_SetMemObjectDestructor 1
cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem memobj,
void (* pfn_notify)(cl_mem memobj, void * user_data),
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/* Context Logging Functions
*
* The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
* Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*
* clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger
*/
#define cl_APPLE_ContextLoggingFunctions 1
extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * errstr,
const void * private_info,
size_t cb,
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * errstr,
const void * private_info,
size_t cb,
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * errstr,
const void * private_info,
size_t cb,
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/************************
* cl_khr_icd extension *
************************/
#define cl_khr_icd 1
/* cl_platform_info */
#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
/* Additional Error Codes */
#define CL_PLATFORM_NOT_FOUND_KHR -1001
extern CL_API_ENTRY cl_int CL_API_CALL
clIcdGetPlatformIDsKHR(cl_uint num_entries,
cl_platform_id * platforms,
cl_uint * num_platforms);
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint num_entries,
cl_platform_id * platforms,
cl_uint * num_platforms);
/*******************************
* cl_khr_il_program extension *
*******************************/
#define cl_khr_il_program 1
/* New property to clGetDeviceInfo for retrieving supported intermediate
* languages
*/
#define CL_DEVICE_IL_VERSION_KHR 0x105B
/* New property to clGetProgramInfo for retrieving the IL of a
 * program
 */
#define CL_PROGRAM_IL_KHR 0x1169
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithILKHR(cl_context context,
const void * il,
size_t length,
cl_int * errcode_ret);
typedef CL_API_ENTRY cl_program
(CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context context,
const void * il,
size_t length,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
/* Extension: cl_khr_image2d_from_buffer
*
* This extension allows a 2D image to be created from a cl_mem buffer without
* a copy. The type associated with a 2D image created from a buffer in an
* OpenCL program is image2d_t. Both the sampler and sampler-less read_image
* built-in functions are supported for 2D images and 2D images created from
* a buffer. Similarly, the write_image built-ins are also supported for 2D
* images created from a buffer.
*
* When the 2D image from buffer is created, the client must specify the
* width, height, image format (i.e. channel order and channel data type)
* and optionally the row pitch.
*
* The pitch specified must be a multiple of
* CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels.
* The base address of the buffer must be aligned to
* CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels.
*/
#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR 0x104A
#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR 0x104B
/**************************************
* cl_khr_initialize_memory extension *
**************************************/
#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030
/**************************************
* cl_khr_terminate_context extension *
**************************************/
#define CL_CONTEXT_TERMINATED_KHR -1121
#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031
#define CL_CONTEXT_TERMINATE_KHR 0x2032
#define cl_khr_terminate_context 1
extern CL_API_ENTRY cl_int CL_API_CALL
clTerminateContextKHR(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
/*
* Extension: cl_khr_spir
*
* This extension adds support to create an OpenCL program object from a
* Standard Portable Intermediate Representation (SPIR) instance
*/
#define CL_DEVICE_SPIR_VERSIONS 0x40E0
#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1
/*****************************************
* cl_khr_create_command_queue extension *
*****************************************/
#define cl_khr_create_command_queue 1
typedef cl_bitfield cl_queue_properties_khr;
extern CL_API_ENTRY cl_command_queue CL_API_CALL
clCreateCommandQueueWithPropertiesKHR(cl_context context,
cl_device_id device,
const cl_queue_properties_khr* properties,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_command_queue
(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context,
cl_device_id device,
const cl_queue_properties_khr* properties,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
/******************************************
* cl_nv_device_attribute_query extension *
******************************************/
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
/*********************************
* cl_amd_device_attribute_query *
*********************************/
#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
#define CL_DEVICE_TOPOLOGY_AMD 0x4037
#define CL_DEVICE_BOARD_NAME_AMD 0x4038
#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039
#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040
#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047
#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048
#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD 0x4049
#define CL_DEVICE_GFXIP_MAJOR_AMD 0x404A
#define CL_DEVICE_GFXIP_MINOR_AMD 0x404B
#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD 0x404C
#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030
#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031
#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD 0x4033
#define CL_DEVICE_PCIE_ID_AMD 0x4034
/*********************************
* cl_arm_printf extension
*********************************/
#define CL_PRINTF_CALLBACK_ARM 0x40B0
#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1
/***********************************
* cl_ext_device_fission extension
***********************************/
#define cl_ext_device_fission 1
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
typedef cl_ulong cl_device_partition_property_ext;
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateSubDevicesEXT(cl_device_id in_device,
const cl_device_partition_property_ext * properties,
cl_uint num_entries,
cl_device_id * out_devices,
cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id in_device,
const cl_device_partition_property_ext * properties,
cl_uint num_entries,
cl_device_id * out_devices,
cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1;
/* cl_device_partition_property_ext */
#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
/* clDeviceGetInfo selectors */
#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
/* error codes */
#define CL_DEVICE_PARTITION_FAILED_EXT -1057
#define CL_INVALID_PARTITION_COUNT_EXT -1058
#define CL_INVALID_PARTITION_NAME_EXT -1059
/* CL_AFFINITY_DOMAINs */
#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
/* cl_device_partition_property_ext list terminators */
#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
/***********************************
* cl_ext_migrate_memobject extension definitions
***********************************/
#define cl_ext_migrate_memobject 1
typedef cl_bitfield cl_mem_migration_flags_ext;
#define CL_MIGRATE_MEM_OBJECT_HOST_EXT 0x1
#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMigrateMemObjectEXT(cl_command_queue command_queue,
cl_uint num_mem_objects,
const cl_mem * mem_objects,
cl_mem_migration_flags_ext flags,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue,
cl_uint num_mem_objects,
const cl_mem * mem_objects,
cl_mem_migration_flags_ext flags,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
/*********************************
* cl_qcom_ext_host_ptr extension
*********************************/
#define cl_qcom_ext_host_ptr 1
#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29)
#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0
#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1
#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2
#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3
#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4
#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5
#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6
#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7
typedef cl_uint cl_image_pitch_info_qcom;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceImageInfoQCOM(cl_device_id device,
size_t image_width,
size_t image_height,
const cl_image_format *image_format,
cl_image_pitch_info_qcom param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
/* Base descriptor for the cl_qcom_ext_host_ptr extension. Layered
 * extensions (e.g. cl_qcom_ion_host_ptr, the Android native buffer
 * extension below) embed this struct as their first member so the
 * implementation can identify which kind of external allocation is
 * being imported. */
typedef struct _cl_mem_ext_host_ptr
{
/* Type of external memory allocation. */
/* Legal values will be defined in layered extensions
 * (e.g. CL_MEM_ION_HOST_PTR_QCOM, CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM). */
cl_uint allocation_type;
/* Host cache policy for this external memory allocation
 * (e.g. CL_MEM_HOST_UNCACHED_QCOM, CL_MEM_HOST_WRITEBACK_QCOM). */
cl_uint host_cache_policy;
} cl_mem_ext_host_ptr;
/*******************************************
* cl_qcom_ext_host_ptr_iocoherent extension
********************************************/
/* Cache policy specifying io-coherence */
#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9
/*********************************
* cl_qcom_ion_host_ptr extension
*********************************/
#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8
/* Descriptor for importing an ION allocation via the
 * cl_qcom_ion_host_ptr extension. */
typedef struct _cl_mem_ion_host_ptr
{
/* Type of external memory allocation. */
/* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
cl_mem_ext_host_ptr ext_host_ptr;
/* ION file descriptor referring to the allocation */
int ion_filedesc;
/* Host pointer to the ION allocated memory */
void* ion_hostptr;
} cl_mem_ion_host_ptr;
/*********************************
* cl_qcom_android_native_buffer_host_ptr extension
*********************************/
#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6
/* Descriptor for importing an Android native buffer via the
 * cl_qcom_android_native_buffer_host_ptr extension. */
typedef struct _cl_mem_android_native_buffer_host_ptr
{
/* Type of external memory allocation. */
/* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. */
cl_mem_ext_host_ptr ext_host_ptr;
/* Virtual pointer to the android native buffer */
void* anb_ptr;
} cl_mem_android_native_buffer_host_ptr;
/******************************************
* cl_img_yuv_image extension *
******************************************/
/* Image formats used in clCreateImage */
#define CL_NV21_IMG 0x40D0
#define CL_YV12_IMG 0x40D1
/******************************************
* cl_img_cached_allocations extension *
******************************************/
/* Flag values used by clCreateBuffer */
#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26)
#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27)
/******************************************
* cl_img_use_gralloc_ptr extension *
******************************************/
#define cl_img_use_gralloc_ptr 1
/* Flag values used by clCreateBuffer */
#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28)
/* To be used by clGetEventInfo: */
#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2
#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3
/* Error code from clEnqueueReleaseGrallocObjectsIMG */
#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGrallocObjectsIMG(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGrallocObjectsIMG(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
/*********************************
* cl_khr_subgroups extension
*********************************/
#define cl_khr_subgroups 1
#if !defined(CL_VERSION_2_1)
/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h.
In hindsight, there should have been a khr suffix on this type for
the extension, but keeping it un-suffixed to maintain backwards
compatibility. */
typedef cl_uint cl_kernel_sub_group_info;
#endif
/* cl_kernel_sub_group_info */
#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033
#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelSubGroupInfoKHR(cl_kernel in_kernel,
cl_device_id in_device,
cl_kernel_sub_group_info param_name,
size_t input_value_size,
const void * input_value,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
typedef CL_API_ENTRY cl_int
(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel in_kernel,
cl_device_id in_device,
cl_kernel_sub_group_info param_name,
size_t input_value_size,
const void * input_value,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
/*********************************
* cl_khr_mipmap_image extension
*********************************/
/* cl_sampler_properties */
#define CL_SAMPLER_MIP_FILTER_MODE_KHR 0x1155
#define CL_SAMPLER_LOD_MIN_KHR 0x1156
#define CL_SAMPLER_LOD_MAX_KHR 0x1157
/*********************************
* cl_khr_priority_hints extension
*********************************/
/* This extension define is for backwards compatibility.
It shouldn't be required since this extension has no new functions. */
#define cl_khr_priority_hints 1
typedef cl_uint cl_queue_priority_khr;
/* cl_command_queue_properties */
#define CL_QUEUE_PRIORITY_KHR 0x1096
/* cl_queue_priority_khr */
#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)
#define CL_QUEUE_PRIORITY_MED_KHR (1<<1)
#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)
/*********************************
* cl_khr_throttle_hints extension
*********************************/
/* This extension define is for backwards compatibility.
It shouldn't be required since this extension has no new functions. */
#define cl_khr_throttle_hints 1
typedef cl_uint cl_queue_throttle_khr;
/* cl_command_queue_properties */
#define CL_QUEUE_THROTTLE_KHR 0x1097
/* cl_queue_throttle_khr */
#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)
#define CL_QUEUE_THROTTLE_MED_KHR (1<<1)
#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)
/*********************************
* cl_khr_subgroup_named_barrier
*********************************/
/* This extension define is for backwards compatibility.
It shouldn't be required since this extension has no new functions. */
#define cl_khr_subgroup_named_barrier 1
/* cl_device_info */
#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035
/*********************************
* cl_khr_extended_versioning
*********************************/
#define cl_khr_extended_versioning 1
#define CL_VERSION_MAJOR_BITS_KHR (10)
#define CL_VERSION_MINOR_BITS_KHR (10)
#define CL_VERSION_PATCH_BITS_KHR (12)
#define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1)
#define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1)
#define CL_VERSION_PATCH_MASK_KHR ((1 << CL_VERSION_PATCH_BITS_KHR) - 1)
#define CL_VERSION_MAJOR_KHR(version) ((version) >> (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR))
#define CL_VERSION_MINOR_KHR(version) (((version) >> CL_VERSION_PATCH_BITS_KHR) & CL_VERSION_MINOR_MASK_KHR)
#define CL_VERSION_PATCH_KHR(version) ((version) & CL_VERSION_PATCH_MASK_KHR)
#define CL_MAKE_VERSION_KHR(major, minor, patch) \
((((major) & CL_VERSION_MAJOR_MASK_KHR) << (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) | \
(((minor) & CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \
((patch) & CL_VERSION_PATCH_MASK_KHR))
typedef cl_uint cl_version_khr;
#define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64
/* Pairs a packed numeric version (see CL_MAKE_VERSION_KHR /
 * CL_VERSION_*_KHR above) with a fixed-size name buffer; returned by the
 * *_WITH_VERSION_KHR queries defined below. */
typedef struct _cl_name_version_khr
{
cl_version_khr version; /* packed major/minor/patch version */
char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR]; /* e.g. extension or IL name */
} cl_name_version_khr;
/* cl_platform_info */
#define CL_PLATFORM_NUMERIC_VERSION_KHR 0x0906
#define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR 0x0907
/* cl_device_info */
#define CL_DEVICE_NUMERIC_VERSION_KHR 0x105E
#define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR 0x105F
#define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR 0x1060
#define CL_DEVICE_ILS_WITH_VERSION_KHR 0x1061
#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR 0x1062
/*********************************
* cl_khr_device_uuid extension
*********************************/
#define cl_khr_device_uuid 1
#define CL_UUID_SIZE_KHR 16
#define CL_LUID_SIZE_KHR 8
#define CL_DEVICE_UUID_KHR 0x106A
#define CL_DRIVER_UUID_KHR 0x106B
#define CL_DEVICE_LUID_VALID_KHR 0x106C
#define CL_DEVICE_LUID_KHR 0x106D
#define CL_DEVICE_NODE_MASK_KHR 0x106E
/**********************************
* cl_arm_import_memory extension *
**********************************/
#define cl_arm_import_memory 1
typedef intptr_t cl_import_properties_arm;
/* Default and valid property names for cl_arm_import_memory */
#define CL_IMPORT_TYPE_ARM 0x40B2
/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_HOST_ARM 0x40B3
/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4
/* Protected memory property */
#define CL_IMPORT_TYPE_PROTECTED_ARM 0x40B5
/* Android hardware buffer type value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2
/* Data consistency with host property */
#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3
/* Import memory size value to indicate a size for the whole buffer */
#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX
/* This extension adds a new function that allows for direct memory import into
* OpenCL via the clImportMemoryARM function.
*
* Memory imported through this interface will be mapped into the device's page
* tables directly, providing zero copy access. It will never fall back to copy
* operations and aliased buffers.
*
* Types of memory supported for import are specified as additional extension
* strings.
*
* This extension produces cl_mem allocations which are compatible with all other
* users of cl_mem in the standard API.
*
* This extension maps pages with the same properties as the normal buffer creation
* function clCreateBuffer.
*/
extern CL_API_ENTRY cl_mem CL_API_CALL
clImportMemoryARM( cl_context context,
cl_mem_flags flags,
const cl_import_properties_arm *properties,
void *memory,
size_t size,
cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
/******************************************
* cl_arm_shared_virtual_memory extension *
******************************************/
#define cl_arm_shared_virtual_memory 1
/* Used by clGetDeviceInfo */
#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6
/* Used by clGetMemObjectInfo */
#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7
/* Used by clSetKernelExecInfoARM: */
#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8
#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9
/* To be used by clGetEventInfo: */
#define CL_COMMAND_SVM_FREE_ARM 0x40BA
#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB
#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC
#define CL_COMMAND_SVM_MAP_ARM 0x40BD
#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE
/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0)
#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1)
#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2)
#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3)
/* Flag values used by clSVMAllocARM: */
#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10)
#define CL_MEM_SVM_ATOMICS_ARM (1 << 11)
typedef cl_bitfield cl_svm_mem_flags_arm;
typedef cl_uint cl_kernel_exec_info_arm;
typedef cl_bitfield cl_device_svm_capabilities_arm;
extern CL_API_ENTRY void * CL_API_CALL
clSVMAllocARM(cl_context context,
cl_svm_mem_flags_arm flags,
size_t size,
cl_uint alignment) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY void CL_API_CALL
clSVMFreeARM(cl_context context,
void * svm_pointer) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMFreeARM(cl_command_queue command_queue,
cl_uint num_svm_pointers,
void * svm_pointers[],
void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue,
cl_uint num_svm_pointers,
void * svm_pointers[],
void * user_data),
void * user_data,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemcpyARM(cl_command_queue command_queue,
cl_bool blocking_copy,
void * dst_ptr,
const void * src_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemFillARM(cl_command_queue command_queue,
void * svm_ptr,
const void * pattern,
size_t pattern_size,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMapARM(cl_command_queue command_queue,
cl_bool blocking_map,
cl_map_flags flags,
void * svm_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMUnmapARM(cl_command_queue command_queue,
void * svm_ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArgSVMPointerARM(cl_kernel kernel,
cl_uint arg_index,
const void * arg_value) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelExecInfoARM(cl_kernel kernel,
cl_kernel_exec_info_arm param_name,
size_t param_value_size,
const void * param_value) CL_EXT_SUFFIX__VERSION_1_2;
/********************************
* cl_arm_get_core_id extension *
********************************/
#ifdef CL_VERSION_1_2
#define cl_arm_get_core_id 1
/* Device info property for bitfield of cores present */
#define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM 0x40BF
#endif /* CL_VERSION_1_2 */
/*********************************
* cl_arm_job_slot_selection
*********************************/
#define cl_arm_job_slot_selection 1
/* cl_device_info */
#define CL_DEVICE_JOB_SLOTS_ARM 0x41E0
/* cl_command_queue_properties */
#define CL_QUEUE_JOB_SLOT_ARM 0x41E1
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_ext_intel.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2020 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_ext_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __CL_EXT_INTEL_H
#define __CL_EXT_INTEL_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/***************************************
* cl_intel_thread_local_exec extension *
****************************************/
#define cl_intel_thread_local_exec 1
#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31)
/***********************************************
* cl_intel_device_partition_by_names extension *
************************************************/
#define cl_intel_device_partition_by_names 1
#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052
#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1
/************************************************
* cl_intel_accelerator extension *
* cl_intel_motion_estimation extension *
* cl_intel_advanced_motion_estimation extension *
*************************************************/
#define cl_intel_accelerator 1
#define cl_intel_motion_estimation 1
#define cl_intel_advanced_motion_estimation 1
typedef struct _cl_accelerator_intel* cl_accelerator_intel;
typedef cl_uint cl_accelerator_type_intel;
typedef cl_uint cl_accelerator_info_intel;
/* Configuration descriptor for a motion estimation accelerator
 * (cl_intel_motion_estimation); field names correspond to the CL_ME_*
 * define groups below. Presumably passed as the `descriptor` argument of
 * clCreateAcceleratorINTEL — confirm against the extension spec. */
typedef struct _cl_motion_estimation_desc_intel {
cl_uint mb_block_type; /* CL_ME_MB_TYPE_* */
cl_uint subpixel_mode; /* CL_ME_SUBPIXEL_MODE_* */
cl_uint sad_adjust_mode; /* CL_ME_SAD_ADJUST_MODE_* */
cl_uint search_path_type; /* CL_ME_SEARCH_PATH_RADIUS_* */
} cl_motion_estimation_desc_intel;
/* error codes */
#define CL_INVALID_ACCELERATOR_INTEL -1094
#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095
#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
/* cl_accelerator_type_intel */
#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0
/* cl_accelerator_info_intel */
#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090
#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091
#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092
#define CL_ACCELERATOR_TYPE_INTEL 0x4093
/* cl_motion_detect_desc_intel flags */
#define CL_ME_MB_TYPE_16x16_INTEL 0x0
#define CL_ME_MB_TYPE_8x8_INTEL 0x1
#define CL_ME_MB_TYPE_4x4_INTEL 0x2
#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2
#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1
#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0
#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1
#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5
#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0
#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1
#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2
#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4
#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1
#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2
#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3
#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16
#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21
#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32
#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43
#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48
#define CL_ME_COST_PENALTY_NONE_INTEL 0x0
#define CL_ME_COST_PENALTY_LOW_INTEL 0x1
#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2
#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3
#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CL_ME_COST_PRECISION_PEL_INTEL 0x2
#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
/* cl_device_info */
#define CL_DEVICE_ME_VERSION_INTEL 0x407E
#define CL_ME_VERSION_LEGACY_INTEL 0x0
#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1
#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2
extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
clCreateAcceleratorINTEL(
cl_context context,
cl_accelerator_type_intel accelerator_type,
size_t descriptor_size,
const void* descriptor,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
cl_context context,
cl_accelerator_type_intel accelerator_type,
size_t descriptor_size,
const void* descriptor,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetAcceleratorInfoINTEL(
cl_accelerator_intel accelerator,
cl_accelerator_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
cl_accelerator_intel accelerator,
cl_accelerator_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainAcceleratorINTEL(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseAcceleratorINTEL(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
/******************************************
* cl_intel_simultaneous_sharing extension *
*******************************************/
#define cl_intel_simultaneous_sharing 1
#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104
#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105
/***********************************
* cl_intel_egl_image_yuv extension *
************************************/
#define cl_intel_egl_image_yuv 1
#define CL_EGL_YUV_PLANE_INTEL 0x4107
/********************************
* cl_intel_packed_yuv extension *
*********************************/
#define cl_intel_packed_yuv 1
#define CL_YUYV_INTEL 0x4076
#define CL_UYVY_INTEL 0x4077
#define CL_YVYU_INTEL 0x4078
#define CL_VYUY_INTEL 0x4079
/********************************************
* cl_intel_required_subgroup_size extension *
*********************************************/
#define cl_intel_required_subgroup_size 1
#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109
#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A
/****************************************
* cl_intel_driver_diagnostics extension *
*****************************************/
#define cl_intel_driver_diagnostics 1
typedef cl_uint cl_diagnostics_verbose_level;
#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 )
/********************************
* cl_intel_planar_yuv extension *
*********************************/
#define CL_NV12_INTEL 0x410E
#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 )
#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 )
#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E
#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F
/*******************************************************
* cl_intel_device_side_avc_motion_estimation extension *
********************************************************/
#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B
#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C
#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D
#define CL_AVC_ME_VERSION_0_INTEL 0x0 /* No support. */
#define CL_AVC_ME_VERSION_1_INTEL 0x1 /* First supported version. */
#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0
#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1
#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2
#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3
#define CL_AVC_ME_MINOR_8x8_INTEL 0x0
#define CL_AVC_ME_MINOR_8x4_INTEL 0x1
#define CL_AVC_ME_MINOR_4x8_INTEL 0x2
#define CL_AVC_ME_MINOR_4x4_INTEL 0x3
#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0
#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9
#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2
#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa
#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2
#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3
#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3
#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30
#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000
#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 )
#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80
#define CL_AVC_ME_INTRA_16x16_INTEL 0x0
#define CL_AVC_ME_INTRA_8x8_INTEL 0x1
#define CL_AVC_ME_INTRA_4x4_INTEL 0x2
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3
#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1
#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2
#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3
#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2
#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
/*******************************************
* cl_intel_unified_shared_memory extension *
********************************************/
/* These APIs are in sync with Revision O of the cl_intel_unified_shared_memory spec! */
#define cl_intel_unified_shared_memory 1
/* cl_device_info */
#define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL 0x4190
#define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL 0x4191
#define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192
#define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193
#define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL 0x4194
typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel;
/* cl_device_unified_shared_memory_capabilities_intel - bitfield */
#define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL (1 << 0)
#define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL (1 << 1)
#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL (1 << 2)
#define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3)
typedef cl_bitfield cl_mem_properties_intel;
/* cl_mem_properties_intel */
#define CL_MEM_ALLOC_FLAGS_INTEL 0x4195
typedef cl_bitfield cl_mem_alloc_flags_intel;
/* cl_mem_alloc_flags_intel - bitfield */
#define CL_MEM_ALLOC_WRITE_COMBINED_INTEL (1 << 0)
typedef cl_uint cl_mem_info_intel;
/* cl_mem_alloc_info_intel */
#define CL_MEM_ALLOC_TYPE_INTEL 0x419A
#define CL_MEM_ALLOC_BASE_PTR_INTEL 0x419B
#define CL_MEM_ALLOC_SIZE_INTEL 0x419C
#define CL_MEM_ALLOC_DEVICE_INTEL 0x419D
/* Enum values 0x419E-0x419F are reserved for future queries. */
typedef cl_uint cl_unified_shared_memory_type_intel;
/* cl_unified_shared_memory_type_intel */
#define CL_MEM_TYPE_UNKNOWN_INTEL 0x4196
#define CL_MEM_TYPE_HOST_INTEL 0x4197
#define CL_MEM_TYPE_DEVICE_INTEL 0x4198
#define CL_MEM_TYPE_SHARED_INTEL 0x4199
typedef cl_uint cl_mem_advice_intel;
/* cl_mem_advice_intel */
/* Enum values 0x4208-0x420F are reserved for future memory advices. */
/* cl_kernel_exec_info */
#define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL 0x4200
#define CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL 0x4201
#define CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL 0x4202
#define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL 0x4203
/* cl_command_type */
#define CL_COMMAND_MEMFILL_INTEL 0x4204
#define CL_COMMAND_MEMCPY_INTEL 0x4205
#define CL_COMMAND_MIGRATEMEM_INTEL 0x4206
#define CL_COMMAND_MEMADVISE_INTEL 0x4207
extern CL_API_ENTRY void* CL_API_CALL
clHostMemAllocINTEL(
cl_context context,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
typedef CL_API_ENTRY void* (CL_API_CALL *
clHostMemAllocINTEL_fn)(
cl_context context,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
extern CL_API_ENTRY void* CL_API_CALL
clDeviceMemAllocINTEL(
cl_context context,
cl_device_id device,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
typedef CL_API_ENTRY void* (CL_API_CALL *
clDeviceMemAllocINTEL_fn)(
cl_context context,
cl_device_id device,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
extern CL_API_ENTRY void* CL_API_CALL
clSharedMemAllocINTEL(
cl_context context,
cl_device_id device,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
typedef CL_API_ENTRY void* (CL_API_CALL *
clSharedMemAllocINTEL_fn)(
cl_context context,
cl_device_id device,
const cl_mem_properties_intel* properties,
size_t size,
cl_uint alignment,
cl_int* errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL
clMemFreeINTEL(
cl_context context,
void* ptr);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clMemFreeINTEL_fn)(
cl_context context,
void* ptr);
extern CL_API_ENTRY cl_int CL_API_CALL
clGetMemAllocInfoINTEL(
cl_context context,
const void* ptr,
cl_mem_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clGetMemAllocInfoINTEL_fn)(
cl_context context,
const void* ptr,
cl_mem_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret);
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArgMemPointerINTEL(
cl_kernel kernel,
cl_uint arg_index,
const void* arg_value);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clSetKernelArgMemPointerINTEL_fn)(
cl_kernel kernel,
cl_uint arg_index,
const void* arg_value);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMemsetINTEL( /* Deprecated */
cl_command_queue command_queue,
void* dst_ptr,
cl_int value,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMemsetINTEL_fn)( /* Deprecated */
cl_command_queue command_queue,
void* dst_ptr,
cl_int value,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMemFillINTEL(
cl_command_queue command_queue,
void* dst_ptr,
const void* pattern,
size_t pattern_size,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMemFillINTEL_fn)(
cl_command_queue command_queue,
void* dst_ptr,
const void* pattern,
size_t pattern_size,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMemcpyINTEL(
cl_command_queue command_queue,
cl_bool blocking,
void* dst_ptr,
const void* src_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMemcpyINTEL_fn)(
cl_command_queue command_queue,
cl_bool blocking,
void* dst_ptr,
const void* src_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
#ifdef CL_VERSION_1_2
/* Because these APIs use cl_mem_migration_flags, they require
OpenCL 1.2: */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMigrateMemINTEL(
cl_command_queue command_queue,
const void* ptr,
size_t size,
cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMigrateMemINTEL_fn)(
cl_command_queue command_queue,
const void* ptr,
size_t size,
cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
#endif
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMemAdviseINTEL(
cl_command_queue command_queue,
const void* ptr,
size_t size,
cl_mem_advice_intel advice,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
typedef CL_API_ENTRY cl_int (CL_API_CALL *
clEnqueueMemAdviseINTEL_fn)(
cl_command_queue command_queue,
const void* ptr,
size_t size,
cl_mem_advice_intel advice,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event);
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_INTEL_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_gl.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_GL_H
#define __OPENCL_CL_GL_H
#include <CL/cl.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef cl_uint cl_gl_object_type;
typedef cl_uint cl_gl_texture_info;
typedef cl_uint cl_gl_platform_info;
typedef struct __GLsync *cl_GLsync;
/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
#define CL_GL_OBJECT_BUFFER 0x2000
#define CL_GL_OBJECT_TEXTURE2D 0x2001
#define CL_GL_OBJECT_TEXTURE3D 0x2002
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
#ifdef CL_VERSION_1_2
#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
#define CL_GL_OBJECT_TEXTURE1D 0x200F
#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
#endif
/* cl_gl_texture_info */
#define CL_GL_TEXTURE_TARGET 0x2004
#define CL_GL_MIPMAP_LEVEL 0x2005
#ifdef CL_VERSION_1_2
#define CL_GL_NUM_SAMPLES 0x2012
#endif
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLBuffer(cl_context context,
cl_mem_flags flags,
cl_GLuint bufobj,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#endif
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLRenderbuffer(cl_context context,
cl_mem_flags flags,
cl_GLuint renderbuffer,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLObjectInfo(cl_mem memobj,
cl_gl_object_type * gl_object_type,
cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLTextureInfo(cl_mem memobj,
cl_gl_texture_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
/* Deprecated OpenCL 1.1 APIs */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture2D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture3D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
/* cl_khr_gl_sharing extension */
#define cl_khr_gl_sharing 1
typedef cl_uint cl_gl_context_info;
/* Additional Error Codes */
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
/* cl_gl_context_info */
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
/* Additional cl_context_properties */
#define CL_GL_CONTEXT_KHR 0x2008
#define CL_EGL_DISPLAY_KHR 0x2009
#define CL_GLX_DISPLAY_KHR 0x200A
#define CL_WGL_HDC_KHR 0x200B
#define CL_CGL_SHAREGROUP_KHR 0x200C
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLContextInfoKHR(const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret);
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_gl_ext.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_CL_GL_EXT_H
#define __OPENCL_CL_GL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl_gl.h>
/*
* cl_khr_gl_event extension
*/
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromGLsyncKHR(cl_context context,
cl_GLsync cl_GLsync,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_EXT_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_half.h
================================================
/*******************************************************************************
* Copyright (c) 2019-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/**
* This is a header-only utility library that provides OpenCL host code with
* routines for converting to/from cl_half values.
*
* Example usage:
*
* #include <CL/cl_half.h>
* ...
* cl_half h = cl_half_from_float(0.5f, CL_HALF_RTE);
* cl_float f = cl_half_to_float(h);
*/
#ifndef OPENCL_CL_HALF_H
#define OPENCL_CL_HALF_H
#include <CL/cl_platform.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* Rounding mode used when converting to cl_half.
*/
typedef enum
{
CL_HALF_RTE, // round to nearest even
CL_HALF_RTZ, // round towards zero
CL_HALF_RTP, // round towards positive infinity
CL_HALF_RTN, // round towards negative infinity
} cl_half_rounding_mode;
/* Private utility macros. */
#define CL_HALF_EXP_MASK 0x7C00
#define CL_HALF_MAX_FINITE_MAG 0x7BFF
/*
 * Utility to deal with values that overflow when converting to half precision.
 * Directed rounding modes must saturate instead of producing an infinity in
 * the "wrong" direction; every other case overflows to a signed infinity.
 */
static inline cl_half cl_half_handle_overflow(cl_half_rounding_mode rounding_mode,
uint16_t sign)
{
switch (rounding_mode)
{
case CL_HALF_RTZ:
// Truncating: saturate to the largest finite magnitude, keeping the sign
return (sign << 15) | CL_HALF_MAX_FINITE_MAG;
case CL_HALF_RTP:
// Rounding up: a negative overflow may not reach -infinity, so it
// saturates to the most negative finite value instead
if (sign)
return (1 << 15) | CL_HALF_MAX_FINITE_MAG;
break;
case CL_HALF_RTN:
// Rounding down: a positive overflow may not reach +infinity, so it
// saturates to the largest finite value instead
if (!sign)
return CL_HALF_MAX_FINITE_MAG;
break;
default:
break;
}
// Overflow to infinity (sign bit | all exponent bits set, zero mantissa)
return (sign << 15) | CL_HALF_EXP_MASK;
}
/*
 * Utility to deal with values that underflow when converting to half precision.
 * Directed rounding away from zero yields the smallest-magnitude denormal;
 * every other mode flushes to a (signed) zero.
 */
static inline cl_half cl_half_handle_underflow(cl_half_rounding_mode rounding_mode,
uint16_t sign)
{
// True when the mode rounds this sign away from zero:
// RTP on a positive value, or RTN on a negative one
const int rounds_away = (rounding_mode == CL_HALF_RTP && !sign) ||
                        (rounding_mode == CL_HALF_RTN && sign);
if (rounds_away)
{
// Smallest representable magnitude with the original sign
return (sign << 15) | 1;
}
// Flush to zero, preserving the sign
return (sign << 15);
}
/**
 * Convert a cl_float to a cl_half.
 *
 * f:             the single-precision value to convert.
 * rounding_mode: rounding behaviour for values with no exact
 *                half-precision representation.
 *
 * Special cases (NaN, infinity, zero, overflow, underflow, denormal
 * results) are dispatched explicitly before the common rounding path.
 */
static inline cl_half cl_half_from_float(cl_float f, cl_half_rounding_mode rounding_mode)
{
// Type-punning to get direct access to underlying bits
union
{
cl_float f;
uint32_t i;
} f32;
f32.f = f;
// Extract sign bit
uint16_t sign = f32.i >> 31;
// Extract FP32 exponent and mantissa
uint32_t f_exp = (f32.i >> (CL_FLT_MANT_DIG - 1)) & 0xFF;
uint32_t f_mant = f32.i & ((1 << (CL_FLT_MANT_DIG - 1)) - 1);
// Remove FP32 exponent bias
int32_t exp = f_exp - CL_FLT_MAX_EXP + 1;
// Add FP16 exponent bias
uint16_t h_exp = exp + CL_HALF_MAX_EXP - 1;
// Position of the bit that will become the FP16 mantissa LSB
uint32_t lsb_pos = CL_FLT_MANT_DIG - CL_HALF_MANT_DIG;
// Check for NaN / infinity (all FP32 exponent bits set)
if (f_exp == 0xFF)
{
if (f_mant)
{
// NaN -> propagate mantissa and silence it
// (0x200 is the most-significant bit of the 10-bit FP16 mantissa)
uint16_t h_mant = f_mant >> lsb_pos;
h_mant |= 0x200;
return (sign << 15) | CL_HALF_EXP_MASK | h_mant;
}
else
{
// Infinity -> zero mantissa
return (sign << 15) | CL_HALF_EXP_MASK;
}
}
// Check for zero (sign is kept, so -0.0f maps to negative zero)
if (!f_exp && !f_mant)
{
return (sign << 15);
}
// Check for overflow: exponent too large for any finite FP16 value
if (exp >= CL_HALF_MAX_EXP)
{
return cl_half_handle_overflow(rounding_mode, sign);
}
// Check for underflow: magnitude below the smallest FP16 denormal
if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))
{
return cl_half_handle_underflow(rounding_mode, sign);
}
// Check for value that will become denormal (below the smallest normal
// FP16 exponent, but still representable as an FP16 denormal)
if (exp < -14)
{
// Denormal -> include the implicit 1 from the FP32 mantissa
h_exp = 0;
f_mant |= 1 << (CL_FLT_MANT_DIG - 1);
// Mantissa shift amount depends on exponent
lsb_pos = -exp + (CL_FLT_MANT_DIG - 25);
}
// Generate FP16 mantissa by shifting FP32 mantissa
uint16_t h_mant = f_mant >> lsb_pos;
// Check whether we need to round: `halfway` is the discarded-bits value
// exactly between two representable FP16 results, `mask` selects all
// bits that will be discarded
uint32_t halfway = 1 << (lsb_pos - 1);
uint32_t mask = (halfway << 1) - 1;
switch (rounding_mode)
{
case CL_HALF_RTE:
if ((f_mant & mask) > halfway)
{
// More than halfway -> round up
h_mant += 1;
}
else if ((f_mant & mask) == halfway)
{
// Exactly halfway -> round to nearest even
if (h_mant & 0x1)
h_mant += 1;
}
break;
case CL_HALF_RTZ:
// Mantissa has already been truncated -> do nothing
break;
case CL_HALF_RTP:
if ((f_mant & mask) && !sign)
{
// Round positive numbers up
h_mant += 1;
}
break;
case CL_HALF_RTN:
if ((f_mant & mask) && sign)
{
// Round negative numbers down
h_mant += 1;
}
break;
}
// Check for mantissa overflow: rounding carried out of the 10 mantissa
// bits, so bump the exponent (this also promotes a rounded-up denormal
// to the smallest normal value)
if (h_mant & 0x400)
{
h_exp += 1;
h_mant = 0;
}
return (sign << 15) | (h_exp << 10) | h_mant;
}
/**
 * Convert a cl_double to a cl_half.
 *
 * d:             the double-precision value to convert.
 * rounding_mode: rounding behaviour for values with no exact
 *                half-precision representation.
 *
 * Mirrors cl_half_from_float, but extracts fields from the 64-bit
 * representation (11 exponent bits, 52 stored mantissa bits).
 */
static inline cl_half cl_half_from_double(cl_double d, cl_half_rounding_mode rounding_mode)
{
// Type-punning to get direct access to underlying bits
union
{
cl_double d;
uint64_t i;
} f64;
f64.d = d;
// Extract sign bit
uint16_t sign = f64.i >> 63;
// Extract FP64 exponent and mantissa
uint64_t d_exp = (f64.i >> (CL_DBL_MANT_DIG - 1)) & 0x7FF;
uint64_t d_mant = f64.i & (((uint64_t)1 << (CL_DBL_MANT_DIG - 1)) - 1);
// Remove FP64 exponent bias
int64_t exp = d_exp - CL_DBL_MAX_EXP + 1;
// Add FP16 exponent bias
uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1);
// Position of the bit that will become the FP16 mantissa LSB
uint32_t lsb_pos = CL_DBL_MANT_DIG - CL_HALF_MANT_DIG;
// Check for NaN / infinity (all FP64 exponent bits set)
if (d_exp == 0x7FF)
{
if (d_mant)
{
// NaN -> propagate mantissa and silence it
// (0x200 is the most-significant bit of the 10-bit FP16 mantissa)
uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos);
h_mant |= 0x200;
return (sign << 15) | CL_HALF_EXP_MASK | h_mant;
}
else
{
// Infinity -> zero mantissa
return (sign << 15) | CL_HALF_EXP_MASK;
}
}
// Check for zero (sign is kept, so -0.0 maps to negative zero)
if (!d_exp && !d_mant)
{
return (sign << 15);
}
// Check for overflow: exponent too large for any finite FP16 value
if (exp >= CL_HALF_MAX_EXP)
{
return cl_half_handle_overflow(rounding_mode, sign);
}
// Check for underflow: magnitude below the smallest FP16 denormal
if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1))
{
return cl_half_handle_underflow(rounding_mode, sign);
}
// Check for value that will become denormal (below the smallest normal
// FP16 exponent, but still representable as an FP16 denormal)
if (exp < -14)
{
// Include the implicit 1 from the FP64 mantissa
h_exp = 0;
d_mant |= (uint64_t)1 << (CL_DBL_MANT_DIG - 1);
// Mantissa shift amount depends on exponent
lsb_pos = (uint32_t)(-exp + (CL_DBL_MANT_DIG - 25));
}
// Generate FP16 mantissa by shifting FP64 mantissa
uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos);
// Check whether we need to round: `halfway` is the discarded-bits value
// exactly between two representable FP16 results, `mask` selects all
// bits that will be discarded
uint64_t halfway = (uint64_t)1 << (lsb_pos - 1);
uint64_t mask = (halfway << 1) - 1;
switch (rounding_mode)
{
case CL_HALF_RTE:
if ((d_mant & mask) > halfway)
{
// More than halfway -> round up
h_mant += 1;
}
else if ((d_mant & mask) == halfway)
{
// Exactly halfway -> round to nearest even
if (h_mant & 0x1)
h_mant += 1;
}
break;
case CL_HALF_RTZ:
// Mantissa has already been truncated -> do nothing
break;
case CL_HALF_RTP:
if ((d_mant & mask) && !sign)
{
// Round positive numbers up
h_mant += 1;
}
break;
case CL_HALF_RTN:
if ((d_mant & mask) && sign)
{
// Round negative numbers down
h_mant += 1;
}
break;
}
// Check for mantissa overflow: rounding carried out of the 10 mantissa
// bits, so bump the exponent (this also promotes a rounded-up denormal
// to the smallest normal value)
if (h_mant & 0x400)
{
h_exp += 1;
h_mant = 0;
}
return (sign << 15) | (h_exp << 10) | h_mant;
}
/**
 * Convert a cl_half to a cl_float.
 *
 * h: the half-precision value (its 16-bit pattern) to convert.
 *
 * Every FP16 value is exactly representable in FP32, so the conversion is
 * exact: NaN payloads are carried over and silenced, infinities and signed
 * zeros map directly, and FP16 denormals are normalized into FP32 normals.
 */
static inline cl_float cl_half_to_float(cl_half h)
{
// Type-punning to get direct access to underlying bits
union
{
cl_float f;
uint32_t i;
} f32;
// Extract sign bit. Widened to uint32_t so that `sign << 31` below is an
// unsigned shift; with the original uint16_t, integer promotion made it a
// signed-int shift into the sign bit, which is undefined behaviour in C.
uint32_t sign = h >> 15;
// Extract FP16 exponent and mantissa
uint16_t h_exp = (h >> (CL_HALF_MANT_DIG - 1)) & 0x1F;
uint16_t h_mant = h & 0x3FF;
// Remove FP16 exponent bias
int32_t exp = h_exp - CL_HALF_MAX_EXP + 1;
// Add FP32 exponent bias
uint32_t f_exp = exp + CL_FLT_MAX_EXP - 1;
// Check for NaN / infinity (all FP16 exponent bits set)
if (h_exp == 0x1F)
{
if (h_mant)
{
// NaN -> propagate mantissa and silence it
// (0x400000 is the most-significant FP32 mantissa bit)
uint32_t f_mant = h_mant << (CL_FLT_MANT_DIG - CL_HALF_MANT_DIG);
f_mant |= 0x400000;
f32.i = (sign << 31) | 0x7F800000 | f_mant;
return f32.f;
}
else
{
// Infinity -> zero mantissa
f32.i = (sign << 31) | 0x7F800000;
return f32.f;
}
}
// Check for zero / denormal
if (h_exp == 0)
{
if (h_mant == 0)
{
// Zero -> zero exponent (sign is kept, so -0.0 stays negative)
f_exp = 0;
}
else
{
// Denormal -> normalize it
// - Shift mantissa to make most-significant 1 implicit
// - Adjust exponent accordingly
uint32_t shift = 0;
while ((h_mant & 0x400) == 0)
{
h_mant <<= 1;
shift++;
}
// Drop the now-implicit leading 1, then compensate the exponent
// for the normalization shift
h_mant &= 0x3FF;
f_exp -= shift - 1;
}
}
f32.i = (sign << 31) | (f_exp << 23) | (h_mant << 13);
return f32.f;
}
#undef CL_HALF_EXP_MASK
#undef CL_HALF_MAX_FINITE_MAG
#ifdef __cplusplus
}
#endif
#endif /* OPENCL_CL_HALF_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_icd.h
================================================
/*******************************************************************************
* Copyright (c) 2019-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef OPENCL_CL_ICD_H
#define OPENCL_CL_ICD_H
#include <CL/cl.h>
#include <CL/cl_egl.h>
#include <CL/cl_ext.h>
#include <CL/cl_gl.h>
#if defined(_WIN32)
#include <CL/cl_d3d11.h>
#include <CL/cl_d3d10.h>
#include <CL/cl_dx9_media_sharing.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/*
* This file contains pointer type definitions for each of the CL API calls as
* well as a type definition for the dispatch table used by the Khronos ICD
* loader (see cl_khr_icd extension specification for background).
*/
/* API function pointer definitions */
// Platform APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformIDs)(
cl_uint num_entries, cl_platform_id *platforms,
cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformInfo)(
cl_platform_id platform, cl_platform_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Device APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDs)(
cl_platform_id platform, cl_device_type device_type, cl_uint num_entries,
cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceInfo)(
cl_device_id device, cl_device_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevices)(
cl_device_id in_device,
const cl_device_partition_property *partition_properties,
cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices);
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDevice)(
cl_device_id device) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDevice)(
cl_device_id device) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clCreateSubDevices;
typedef void *cl_api_clRetainDevice;
typedef void *cl_api_clReleaseDevice;
#endif
// Context APIs
typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContext)(
const cl_context_properties *properties, cl_uint num_devices,
const cl_device_id *devices,
void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContextFromType)(
const cl_context_properties *properties, cl_device_type device_type,
void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *),
void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainContext)(
cl_context context) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseContext)(
cl_context context) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetContextInfo)(
cl_context context, cl_context_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Command Queue APIs
typedef CL_API_ENTRY cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueue)(
cl_context context, cl_device_id device,
cl_command_queue_properties properties,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
typedef CL_API_ENTRY
cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueueWithProperties)(
cl_context /* context */, cl_device_id /* device */,
const cl_queue_properties * /* properties */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
#else
typedef void *cl_api_clCreateCommandQueueWithProperties;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainCommandQueue)(
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseCommandQueue)(
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetCommandQueueInfo)(
cl_command_queue command_queue, cl_command_queue_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Memory Object APIs
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBuffer)(
cl_context context, cl_mem_flags flags, size_t size, void *host_ptr,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage)(
cl_context context, cl_mem_flags flags, const cl_image_format *image_format,
const cl_image_desc *image_desc, void *host_ptr,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clCreateImage;
#endif
#ifdef CL_VERSION_3_0
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBufferWithProperties)(
cl_context context, const cl_mem_properties *properties, cl_mem_flags flags,
size_t size, void *host_ptr,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImageWithProperties)(
cl_context context, const cl_mem_properties *properties, cl_mem_flags flags,
const cl_image_format *image_format, const cl_image_desc *image_desc,
void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0;
#else
typedef void *cl_api_clCreateBufferWithProperties;
typedef void *cl_api_clCreateImageWithProperties;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainMemObject)(
cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseMemObject)(
cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSupportedImageFormats)(
cl_context context, cl_mem_flags flags, cl_mem_object_type image_type,
cl_uint num_entries, cl_image_format *image_formats,
cl_uint *num_image_formats) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetMemObjectInfo)(
cl_mem memobj, cl_mem_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetImageInfo)(
cl_mem image, cl_image_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
/* Pipe and shared-virtual-memory entry points, available from OpenCL 2.0
 * onward. Parameter names are kept in comments (upstream style) because
 * they are purely informational in a function-pointer typedef. */
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreatePipe)(
cl_context /* context */, cl_mem_flags /* flags */,
cl_uint /* pipe_packet_size */, cl_uint /* pipe_max_packets */,
const cl_pipe_properties * /* properties */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPipeInfo)(
cl_mem /* pipe */, cl_pipe_info /* param_name */,
size_t /* param_value_size */, void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clSVMAlloc)(
cl_context /* context */, cl_svm_mem_flags /* flags */, size_t /* size */,
unsigned int /* alignment */)CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY void(CL_API_CALL *cl_api_clSVMFree)(
cl_context /* context */,
void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0;
#else
/* Built without CL_VERSION_2_0: degrade each entry to an opaque pointer so
 * the ICD dispatch table keeps the same size and member offsets either way. */
typedef void *cl_api_clCreatePipe;
typedef void *cl_api_clGetPipeInfo;
typedef void *cl_api_clSVMAlloc;
typedef void *cl_api_clSVMFree;
#endif
// Sampler APIs
typedef CL_API_ENTRY cl_sampler(CL_API_CALL *cl_api_clCreateSampler)(
cl_context context, cl_bool normalized_coords,
cl_addressing_mode addressing_mode, cl_filter_mode filter_mode,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainSampler)(
cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseSampler)(
cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSamplerInfo)(
cl_sampler sampler, cl_sampler_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
typedef CL_API_ENTRY
cl_sampler(CL_API_CALL *cl_api_clCreateSamplerWithProperties)(
cl_context /* context */,
const cl_sampler_properties * /* sampler_properties */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
#else
typedef void *cl_api_clCreateSamplerWithProperties;
#endif
// Program Object APIs
typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithSource)(
cl_context context, cl_uint count, const char **strings,
const size_t *lengths, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithBinary)(
cl_context context, cl_uint num_devices, const cl_device_id *device_list,
const size_t *lengths, const unsigned char **binaries,
cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY
cl_program(CL_API_CALL *cl_api_clCreateProgramWithBuiltInKernels)(
cl_context context, cl_uint num_devices, const cl_device_id *device_list,
const char *kernel_names, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clCreateProgramWithBuiltInKernels;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainProgram)(
cl_program program) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseProgram)(
cl_program program) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clBuildProgram)(
cl_program program, cl_uint num_devices, const cl_device_id *device_list,
const char *options,
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCompileProgram)(
cl_program program, cl_uint num_devices, const cl_device_id *device_list,
const char *options, cl_uint num_input_headers,
const cl_program *input_headers, const char **header_include_names,
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clLinkProgram)(
cl_context context, cl_uint num_devices, const cl_device_id *device_list,
const char *options, cl_uint num_input_programs,
const cl_program *input_programs,
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clCompileProgram;
typedef void *cl_api_clLinkProgram;
#endif
#ifdef CL_VERSION_2_2
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clSetProgramSpecializationConstant)(
cl_program program, cl_uint spec_id, size_t spec_size,
const void *spec_value) CL_API_SUFFIX__VERSION_2_2;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetProgramReleaseCallback)(
cl_program program,
void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
void *user_data) CL_API_SUFFIX__VERSION_2_2;
#else
typedef void *cl_api_clSetProgramSpecializationConstant;
typedef void *cl_api_clSetProgramReleaseCallback;
#endif
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadPlatformCompiler)(
cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clUnloadPlatformCompiler;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramInfo)(
cl_program program, cl_program_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramBuildInfo)(
cl_program program, cl_device_id device, cl_program_build_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Kernel Object APIs
typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCreateKernel)(
cl_program program, const char *kernel_name,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateKernelsInProgram)(
cl_program program, cl_uint num_kernels, cl_kernel *kernels,
cl_uint *num_kernels_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainKernel)(
cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseKernel)(
cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArg)(
cl_kernel kernel, cl_uint arg_index, size_t arg_size,
const void *arg_value) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelInfo)(
cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelArgInfo)(
cl_kernel kernel, cl_uint arg_indx, cl_kernel_arg_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clGetKernelArgInfo;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelWorkGroupInfo)(
cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_2_0
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArgSVMPointer)(
cl_kernel /* kernel */, cl_uint /* arg_index */,
const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelExecInfo)(
cl_kernel /* kernel */, cl_kernel_exec_info /* param_name */,
size_t /* param_value_size */,
const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfoKHR)(
cl_kernel /* in_kernel */, cl_device_id /*in_device*/,
cl_kernel_sub_group_info /* param_name */, size_t /*input_value_size*/,
const void * /*input_value*/, size_t /*param_value_size*/,
void * /*param_value*/,
size_t * /*param_value_size_ret*/) CL_EXT_SUFFIX__VERSION_2_0;
#else
typedef void *cl_api_clSetKernelArgSVMPointer;
typedef void *cl_api_clSetKernelExecInfo;
typedef void *cl_api_clGetKernelSubGroupInfoKHR;
#endif
// Event Object APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clWaitForEvents)(
cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventInfo)(
cl_event event, cl_event_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainEvent)(cl_event event)
CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseEvent)(cl_event event)
CL_API_SUFFIX__VERSION_1_0;
// Profiling APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventProfilingInfo)(
cl_event event, cl_profiling_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
// Flush and Finish APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFlush)(
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFinish)(
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
// Enqueued Commands APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBuffer)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
size_t offset, size_t cb, void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBufferRect)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
const size_t *buffer_origin, const size_t *host_origin,
const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
size_t host_row_pitch, size_t host_slice_pitch, void *ptr,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_1;
#else
typedef void *cl_api_clEnqueueReadBufferRect;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBuffer)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write,
size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBufferRect)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read,
const size_t *buffer_origin, const size_t *host_origin,
const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch,
size_t host_row_pitch, size_t host_slice_pitch, const void *ptr,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_1;
#else
typedef void *cl_api_clEnqueueWriteBufferRect;
#endif
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillBuffer)(
cl_command_queue command_queue, cl_mem buffer, const void *pattern,
size_t pattern_size, size_t offset, size_t cb,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clEnqueueFillBuffer;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBuffer)(
cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
size_t src_offset, size_t dst_offset, size_t cb,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferRect)(
cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer,
const size_t *src_origin, const size_t *dst_origin, const size_t *region,
size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch,
size_t dst_slice_pitch, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_1;
#else
typedef void *cl_api_clEnqueueCopyBufferRect;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadImage)(
cl_command_queue command_queue, cl_mem image, cl_bool blocking_read,
const size_t *origin, const size_t *region, size_t row_pitch,
size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteImage)(
cl_command_queue command_queue, cl_mem image, cl_bool blocking_write,
const size_t *origin, const size_t *region, size_t input_row_pitch,
size_t input_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillImage)(
cl_command_queue command_queue, cl_mem image, const void *fill_color,
const size_t origin[3], const size_t region[3],
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clEnqueueFillImage;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImage)(
cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image,
const size_t *src_origin, const size_t *dst_origin, const size_t *region,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImageToBuffer)(
cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer,
const size_t *src_origin, const size_t *region, size_t dst_offset,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferToImage)(
cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image,
size_t src_offset, const size_t *dst_origin, const size_t *region,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapBuffer)(
cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map,
cl_map_flags map_flags, size_t offset, size_t cb,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapImage)(
cl_command_queue command_queue, cl_mem image, cl_bool blocking_map,
cl_map_flags map_flags, const size_t *origin, const size_t *region,
size_t *image_row_pitch, size_t *image_slice_pitch,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueUnmapMemObject)(
cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMigrateMemObjects)(
cl_command_queue command_queue, cl_uint num_mem_objects,
const cl_mem *mem_objects, cl_mem_migration_flags flags,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clEnqueueMigrateMemObjects;
#endif
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNDRangeKernel)(
cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim,
const size_t *global_work_offset, const size_t *global_work_size,
const size_t *local_work_size, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueTask)(
cl_command_queue command_queue, cl_kernel kernel,
cl_uint num_events_in_wait_list, const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNativeKernel)(
cl_command_queue command_queue, void(CL_CALLBACK *user_func)(void *),
void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list,
const void **args_mem_loc, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_2
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarkerWithWaitList)(
cl_command_queue command_queue, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrierWithWaitList)(
cl_command_queue command_queue, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY void *(
CL_API_CALL *cl_api_clGetExtensionFunctionAddressForPlatform)(
cl_platform_id platform,
const char *function_name)CL_API_SUFFIX__VERSION_1_2;
#else
typedef void *cl_api_clEnqueueMarkerWithWaitList;
typedef void *cl_api_clEnqueueBarrierWithWaitList;
typedef void *cl_api_clGetExtensionFunctionAddressForPlatform;
#endif
// Shared Virtual Memory APIs
#ifdef CL_VERSION_2_0
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMFree)(
cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */,
void ** /* svm_pointers */,
void(CL_CALLBACK *pfn_free_func)(cl_command_queue /* queue */,
cl_uint /* num_svm_pointers */,
void ** /* svm_pointers[] */,
void * /* user_data */),
void * /* user_data */, cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemcpy)(
cl_command_queue /* command_queue */, cl_bool /* blocking_copy */,
void * /* dst_ptr */, const void * /* src_ptr */, size_t /* size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemFill)(
cl_command_queue /* command_queue */, void * /* svm_ptr */,
const void * /* pattern */, size_t /* pattern_size */, size_t /* size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMap)(
cl_command_queue /* command_queue */, cl_bool /* blocking_map */,
cl_map_flags /* map_flags */, void * /* svm_ptr */, size_t /* size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMUnmap)(
cl_command_queue /* command_queue */, void * /* svm_ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
#else
typedef void *cl_api_clEnqueueSVMFree;
typedef void *cl_api_clEnqueueSVMMemcpy;
typedef void *cl_api_clEnqueueSVMMemFill;
typedef void *cl_api_clEnqueueSVMMap;
typedef void *cl_api_clEnqueueSVMUnmap;
#endif
// Deprecated APIs
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetCommandQueueProperty)(
cl_command_queue command_queue, cl_command_queue_properties properties,
cl_bool enable, cl_command_queue_properties *old_properties)
CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage2D)(
cl_context context, cl_mem_flags flags, const cl_image_format *image_format,
size_t image_width, size_t image_height, size_t image_row_pitch,
void *host_ptr, cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage3D)(
cl_context context, cl_mem_flags flags, const cl_image_format *image_format,
size_t image_width, size_t image_height, size_t image_depth,
size_t image_row_pitch, size_t image_slice_pitch, void *host_ptr,
cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadCompiler)(void)
CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarker)(
cl_command_queue command_queue,
cl_event *event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWaitForEvents)(
cl_command_queue command_queue, cl_uint num_events,
const cl_event *event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrier)(
cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clGetExtensionFunctionAddress)(
const char *function_name)CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
// GL and other APIs
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLBuffer)(
cl_context context, cl_mem_flags flags, cl_GLuint bufobj,
int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture)(
cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel,
cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture2D)(
cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel,
cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture3D)(
cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel,
cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLRenderbuffer)(
cl_context context, cl_mem_flags flags, cl_GLuint renderbuffer,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLObjectInfo)(
cl_mem memobj, cl_gl_object_type *gl_object_type,
cl_GLuint *gl_object_name) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLTextureInfo)(
cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireGLObjects)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseGLObjects)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
/* cl_khr_gl_sharing */
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLContextInfoKHR)(
const cl_context_properties *properties, cl_gl_context_info param_name,
size_t param_value_size, void *param_value, size_t *param_value_size_ret);
/* cl_khr_gl_event */
typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromGLsyncKHR)(
cl_context context, cl_GLsync sync, cl_int *errcode_ret);
#if defined(_WIN32)
/* cl_khr_d3d10_sharing */
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D10KHR)(
cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source,
void *d3d_object, cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries, cl_device_id *devices,
cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10BufferKHR)(
cl_context context, cl_mem_flags flags, ID3D10Buffer *resource,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture2DKHR)(
cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource,
UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture3DKHR)(
cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource,
UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D10ObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D10ObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D10KHR(
cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source,
void *d3d_object, cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices);
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromD3D10BufferKHR(cl_context context, cl_mem_flags flags,
ID3D10Buffer *resource, cl_int *errcode_ret);
extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture2DKHR(
cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource,
UINT subresource, cl_int *errcode_ret);
extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture3DKHR(
cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource,
UINT subresource, cl_int *errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D10ObjectsKHR(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D10ObjectsKHR(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
/* cl_khr_d3d11_sharing */
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D11KHR)(
cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source,
void *d3d_object, cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries, cl_device_id *devices,
cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2;
/* NOTE(review): continuation of the cl_khr_d3d11_sharing dispatch
 * signatures (the section's opening typedef and the enclosing #if begin
 * above this window -- confirm against the full header). Each typedef is a
 * pointer-to-function type for one extension entry point; these types fill
 * slots in the cl_icd_dispatch table defined later in this file. */
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11BufferKHR)(
cl_context context, cl_mem_flags flags, ID3D11Buffer *resource,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture2DKHR)(
cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource,
UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture3DKHR)(
cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource,
UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D11ObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D11ObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
/* cl_khr_dx9_media_sharing: DX9 media-surface interop entry points */
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR)(
cl_platform_id platform, cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr *media_adapters_type, void *media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries,
cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromDX9MediaSurfaceKHR)(
cl_context context, cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type, void *surface_info,
cl_uint plane, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_1_2;
/* cl_khr_d3d11_sharing: extern prototypes for the extension entry points
 * whose dispatch (function-pointer) signatures are declared above. */
extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D11KHR(
cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source,
void *d3d_object, cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices);
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromD3D11BufferKHR(cl_context context, cl_mem_flags flags,
ID3D11Buffer *resource, cl_int *errcode_ret);
extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture2DKHR(
cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource,
UINT subresource, cl_int *errcode_ret);
extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture3DKHR(
cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource,
UINT subresource, cl_int *errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D11ObjectsKHR(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D11ObjectsKHR(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
/* cl_khr_dx9_media_sharing */
extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromDX9MediaAdapterKHR(
cl_platform_id platform, cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr *media_adapter_type, void *media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries,
cl_device_id *devices, cl_uint *num_devices);
extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromDX9MediaSurfaceKHR(
cl_context context, cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type, void *surface_info,
cl_uint plane, cl_int *errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireDX9MediaSurfacesKHR(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9MediaSurfacesKHR(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
#else
/* When the D3D/DX9 interop headers are unavailable, each dispatch slot
 * degenerates to a plain void pointer so that the cl_icd_dispatch struct
 * below keeps an identical size and slot order on every platform. */
/* cl_khr_d3d10_sharing */
typedef void *cl_api_clGetDeviceIDsFromD3D10KHR;
typedef void *cl_api_clCreateFromD3D10BufferKHR;
typedef void *cl_api_clCreateFromD3D10Texture2DKHR;
typedef void *cl_api_clCreateFromD3D10Texture3DKHR;
typedef void *cl_api_clEnqueueAcquireD3D10ObjectsKHR;
typedef void *cl_api_clEnqueueReleaseD3D10ObjectsKHR;
/* cl_khr_d3d11_sharing */
typedef void *cl_api_clGetDeviceIDsFromD3D11KHR;
typedef void *cl_api_clCreateFromD3D11BufferKHR;
typedef void *cl_api_clCreateFromD3D11Texture2DKHR;
typedef void *cl_api_clCreateFromD3D11Texture3DKHR;
typedef void *cl_api_clEnqueueAcquireD3D11ObjectsKHR;
typedef void *cl_api_clEnqueueReleaseD3D11ObjectsKHR;
/* cl_khr_dx9_media_sharing */
typedef void *cl_api_clCreateFromDX9MediaSurfaceKHR;
typedef void *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR;
typedef void *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR;
typedef void *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR;
#endif
/* OpenCL 1.1 dispatch signatures. Parameter names are carried as inline
 * comments (upstream Khronos style for this section). */
#ifdef CL_VERSION_1_1
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetEventCallback)(
cl_event /* event */, cl_int /* command_exec_callback_type */,
void(CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
void * /* user_data */) CL_API_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateSubBuffer)(
cl_mem /* buffer */, cl_mem_flags /* flags */,
cl_buffer_create_type /* buffer_create_type */,
const void * /* buffer_create_info */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY
cl_int(CL_API_CALL *cl_api_clSetMemObjectDestructorCallback)(
cl_mem /* memobj */,
void(CL_CALLBACK * /*pfn_notify*/)(cl_mem /* memobj */,
void * /*user_data*/),
void * /*user_data */) CL_API_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateUserEvent)(
cl_context /* context */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetUserEventStatus)(
cl_event /* event */,
cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
#else
/* void* placeholders keep the dispatch table layout stable when the
 * OpenCL 1.1 declarations are not available. */
typedef void *cl_api_clSetEventCallback;
typedef void *cl_api_clCreateSubBuffer;
typedef void *cl_api_clSetMemObjectDestructorCallback;
typedef void *cl_api_clCreateUserEvent;
typedef void *cl_api_clSetUserEventStatus;
#endif
/* cl_ext_device_fission: pre-1.2 sub-device partitioning extension. */
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevicesEXT)(
cl_device_id in_device,
const cl_device_partition_property_ext *partition_properties,
cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices);
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDeviceEXT)(
cl_device_id device) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDeviceEXT)(
cl_device_id device) CL_API_SUFFIX__VERSION_1_0;
/* cl_khr_egl_image */
typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromEGLImageKHR)(
cl_context context, CLeglDisplayKHR display, CLeglImageKHR image,
cl_mem_flags flags, const cl_egl_image_properties_khr *properties,
cl_int *errcode_ret);
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireEGLObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseEGLObjectsKHR)(
cl_command_queue command_queue, cl_uint num_objects,
const cl_mem *mem_objects, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list, cl_event *event);
/* cl_khr_egl_event */
typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromEGLSyncKHR)(
cl_context context, CLeglSyncKHR sync, CLeglDisplayKHR display,
cl_int *errcode_ret);
#ifdef CL_VERSION_2_1
/* OpenCL 2.1 dispatch signatures. */
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetDefaultDeviceCommandQueue)(
cl_context context, cl_device_id device,
cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1;
typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithIL)(
cl_context context, const void *il, size_t length,
cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfo)(
cl_kernel kernel, cl_device_id device, cl_kernel_sub_group_info param_name,
size_t input_value_size, const void *input_value, size_t param_value_size,
void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_2_1;
typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCloneKernel)(
cl_kernel source_kernel, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMigrateMem)(
cl_command_queue command_queue, cl_uint num_svm_pointers,
const void **svm_pointers, const size_t *sizes,
cl_mem_migration_flags flags, cl_uint num_events_in_wait_list,
const cl_event *event_wait_list,
cl_event *event) CL_API_SUFFIX__VERSION_2_1;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceAndHostTimer)(
cl_device_id device, cl_ulong *device_timestamp,
cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1;
typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetHostTimer)(
cl_device_id device, cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1;
#else
/* void* placeholders keep the dispatch table layout stable when the
 * OpenCL 2.1 declarations are not available. */
typedef void *cl_api_clSetDefaultDeviceCommandQueue;
typedef void *cl_api_clCreateProgramWithIL;
typedef void *cl_api_clGetKernelSubGroupInfo;
typedef void *cl_api_clCloneKernel;
typedef void *cl_api_clEnqueueSVMMigrateMem;
typedef void *cl_api_clGetDeviceAndHostTimer;
typedef void *cl_api_clGetHostTimer;
#endif
/* Vendor dispatch table structure.
 *
 * NOTE(review): per the Khronos cl_khr_icd extension, the ICD loader routes
 * every API call through a table of this layout supplied by the vendor
 * driver, so the slot ORDER below is the ABI: entries may only ever be
 * appended, never reordered or removed. Slots for unavailable features are
 * typedef'd to void * above, which keeps the table the same size on every
 * platform. Do not edit this struct except to track the upstream header. */
typedef struct _cl_icd_dispatch {
/* OpenCL 1.0 */
cl_api_clGetPlatformIDs clGetPlatformIDs;
cl_api_clGetPlatformInfo clGetPlatformInfo;
cl_api_clGetDeviceIDs clGetDeviceIDs;
cl_api_clGetDeviceInfo clGetDeviceInfo;
cl_api_clCreateContext clCreateContext;
cl_api_clCreateContextFromType clCreateContextFromType;
cl_api_clRetainContext clRetainContext;
cl_api_clReleaseContext clReleaseContext;
cl_api_clGetContextInfo clGetContextInfo;
cl_api_clCreateCommandQueue clCreateCommandQueue;
cl_api_clRetainCommandQueue clRetainCommandQueue;
cl_api_clReleaseCommandQueue clReleaseCommandQueue;
cl_api_clGetCommandQueueInfo clGetCommandQueueInfo;
cl_api_clSetCommandQueueProperty clSetCommandQueueProperty;
cl_api_clCreateBuffer clCreateBuffer;
cl_api_clCreateImage2D clCreateImage2D;
cl_api_clCreateImage3D clCreateImage3D;
cl_api_clRetainMemObject clRetainMemObject;
cl_api_clReleaseMemObject clReleaseMemObject;
cl_api_clGetSupportedImageFormats clGetSupportedImageFormats;
cl_api_clGetMemObjectInfo clGetMemObjectInfo;
cl_api_clGetImageInfo clGetImageInfo;
cl_api_clCreateSampler clCreateSampler;
cl_api_clRetainSampler clRetainSampler;
cl_api_clReleaseSampler clReleaseSampler;
cl_api_clGetSamplerInfo clGetSamplerInfo;
cl_api_clCreateProgramWithSource clCreateProgramWithSource;
cl_api_clCreateProgramWithBinary clCreateProgramWithBinary;
cl_api_clRetainProgram clRetainProgram;
cl_api_clReleaseProgram clReleaseProgram;
cl_api_clBuildProgram clBuildProgram;
cl_api_clUnloadCompiler clUnloadCompiler;
cl_api_clGetProgramInfo clGetProgramInfo;
cl_api_clGetProgramBuildInfo clGetProgramBuildInfo;
cl_api_clCreateKernel clCreateKernel;
cl_api_clCreateKernelsInProgram clCreateKernelsInProgram;
cl_api_clRetainKernel clRetainKernel;
cl_api_clReleaseKernel clReleaseKernel;
cl_api_clSetKernelArg clSetKernelArg;
cl_api_clGetKernelInfo clGetKernelInfo;
cl_api_clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo;
cl_api_clWaitForEvents clWaitForEvents;
cl_api_clGetEventInfo clGetEventInfo;
cl_api_clRetainEvent clRetainEvent;
cl_api_clReleaseEvent clReleaseEvent;
cl_api_clGetEventProfilingInfo clGetEventProfilingInfo;
cl_api_clFlush clFlush;
cl_api_clFinish clFinish;
cl_api_clEnqueueReadBuffer clEnqueueReadBuffer;
cl_api_clEnqueueWriteBuffer clEnqueueWriteBuffer;
cl_api_clEnqueueCopyBuffer clEnqueueCopyBuffer;
cl_api_clEnqueueReadImage clEnqueueReadImage;
cl_api_clEnqueueWriteImage clEnqueueWriteImage;
cl_api_clEnqueueCopyImage clEnqueueCopyImage;
cl_api_clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer;
cl_api_clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage;
cl_api_clEnqueueMapBuffer clEnqueueMapBuffer;
cl_api_clEnqueueMapImage clEnqueueMapImage;
cl_api_clEnqueueUnmapMemObject clEnqueueUnmapMemObject;
cl_api_clEnqueueNDRangeKernel clEnqueueNDRangeKernel;
cl_api_clEnqueueTask clEnqueueTask;
cl_api_clEnqueueNativeKernel clEnqueueNativeKernel;
cl_api_clEnqueueMarker clEnqueueMarker;
cl_api_clEnqueueWaitForEvents clEnqueueWaitForEvents;
cl_api_clEnqueueBarrier clEnqueueBarrier;
cl_api_clGetExtensionFunctionAddress clGetExtensionFunctionAddress;
cl_api_clCreateFromGLBuffer clCreateFromGLBuffer;
cl_api_clCreateFromGLTexture2D clCreateFromGLTexture2D;
cl_api_clCreateFromGLTexture3D clCreateFromGLTexture3D;
cl_api_clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer;
cl_api_clGetGLObjectInfo clGetGLObjectInfo;
cl_api_clGetGLTextureInfo clGetGLTextureInfo;
cl_api_clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects;
cl_api_clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects;
cl_api_clGetGLContextInfoKHR clGetGLContextInfoKHR;
/* cl_khr_d3d10_sharing */
cl_api_clGetDeviceIDsFromD3D10KHR clGetDeviceIDsFromD3D10KHR;
cl_api_clCreateFromD3D10BufferKHR clCreateFromD3D10BufferKHR;
cl_api_clCreateFromD3D10Texture2DKHR clCreateFromD3D10Texture2DKHR;
cl_api_clCreateFromD3D10Texture3DKHR clCreateFromD3D10Texture3DKHR;
cl_api_clEnqueueAcquireD3D10ObjectsKHR clEnqueueAcquireD3D10ObjectsKHR;
cl_api_clEnqueueReleaseD3D10ObjectsKHR clEnqueueReleaseD3D10ObjectsKHR;
/* OpenCL 1.1 */
cl_api_clSetEventCallback clSetEventCallback;
cl_api_clCreateSubBuffer clCreateSubBuffer;
cl_api_clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback;
cl_api_clCreateUserEvent clCreateUserEvent;
cl_api_clSetUserEventStatus clSetUserEventStatus;
cl_api_clEnqueueReadBufferRect clEnqueueReadBufferRect;
cl_api_clEnqueueWriteBufferRect clEnqueueWriteBufferRect;
cl_api_clEnqueueCopyBufferRect clEnqueueCopyBufferRect;
/* cl_ext_device_fission */
cl_api_clCreateSubDevicesEXT clCreateSubDevicesEXT;
cl_api_clRetainDeviceEXT clRetainDeviceEXT;
cl_api_clReleaseDeviceEXT clReleaseDeviceEXT;
/* cl_khr_gl_event */
cl_api_clCreateEventFromGLsyncKHR clCreateEventFromGLsyncKHR;
/* OpenCL 1.2 */
cl_api_clCreateSubDevices clCreateSubDevices;
cl_api_clRetainDevice clRetainDevice;
cl_api_clReleaseDevice clReleaseDevice;
cl_api_clCreateImage clCreateImage;
cl_api_clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels;
cl_api_clCompileProgram clCompileProgram;
cl_api_clLinkProgram clLinkProgram;
cl_api_clUnloadPlatformCompiler clUnloadPlatformCompiler;
cl_api_clGetKernelArgInfo clGetKernelArgInfo;
cl_api_clEnqueueFillBuffer clEnqueueFillBuffer;
cl_api_clEnqueueFillImage clEnqueueFillImage;
cl_api_clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects;
cl_api_clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList;
cl_api_clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList;
cl_api_clGetExtensionFunctionAddressForPlatform
clGetExtensionFunctionAddressForPlatform;
cl_api_clCreateFromGLTexture clCreateFromGLTexture;
/* cl_khr_d3d11_sharing */
cl_api_clGetDeviceIDsFromD3D11KHR clGetDeviceIDsFromD3D11KHR;
cl_api_clCreateFromD3D11BufferKHR clCreateFromD3D11BufferKHR;
cl_api_clCreateFromD3D11Texture2DKHR clCreateFromD3D11Texture2DKHR;
cl_api_clCreateFromD3D11Texture3DKHR clCreateFromD3D11Texture3DKHR;
cl_api_clCreateFromDX9MediaSurfaceKHR clCreateFromDX9MediaSurfaceKHR;
cl_api_clEnqueueAcquireD3D11ObjectsKHR clEnqueueAcquireD3D11ObjectsKHR;
cl_api_clEnqueueReleaseD3D11ObjectsKHR clEnqueueReleaseD3D11ObjectsKHR;
/* cl_khr_dx9_media_sharing */
cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR
clGetDeviceIDsFromDX9MediaAdapterKHR;
cl_api_clEnqueueAcquireDX9MediaSurfacesKHR
clEnqueueAcquireDX9MediaSurfacesKHR;
cl_api_clEnqueueReleaseDX9MediaSurfacesKHR
clEnqueueReleaseDX9MediaSurfacesKHR;
/* cl_khr_egl_image */
cl_api_clCreateFromEGLImageKHR clCreateFromEGLImageKHR;
cl_api_clEnqueueAcquireEGLObjectsKHR clEnqueueAcquireEGLObjectsKHR;
cl_api_clEnqueueReleaseEGLObjectsKHR clEnqueueReleaseEGLObjectsKHR;
/* cl_khr_egl_event */
cl_api_clCreateEventFromEGLSyncKHR clCreateEventFromEGLSyncKHR;
/* OpenCL 2.0 */
cl_api_clCreateCommandQueueWithProperties clCreateCommandQueueWithProperties;
cl_api_clCreatePipe clCreatePipe;
cl_api_clGetPipeInfo clGetPipeInfo;
cl_api_clSVMAlloc clSVMAlloc;
cl_api_clSVMFree clSVMFree;
cl_api_clEnqueueSVMFree clEnqueueSVMFree;
cl_api_clEnqueueSVMMemcpy clEnqueueSVMMemcpy;
cl_api_clEnqueueSVMMemFill clEnqueueSVMMemFill;
cl_api_clEnqueueSVMMap clEnqueueSVMMap;
cl_api_clEnqueueSVMUnmap clEnqueueSVMUnmap;
cl_api_clCreateSamplerWithProperties clCreateSamplerWithProperties;
cl_api_clSetKernelArgSVMPointer clSetKernelArgSVMPointer;
cl_api_clSetKernelExecInfo clSetKernelExecInfo;
/* cl_khr_sub_groups */
cl_api_clGetKernelSubGroupInfoKHR clGetKernelSubGroupInfoKHR;
/* OpenCL 2.1 */
cl_api_clCloneKernel clCloneKernel;
cl_api_clCreateProgramWithIL clCreateProgramWithIL;
cl_api_clEnqueueSVMMigrateMem clEnqueueSVMMigrateMem;
cl_api_clGetDeviceAndHostTimer clGetDeviceAndHostTimer;
cl_api_clGetHostTimer clGetHostTimer;
cl_api_clGetKernelSubGroupInfo clGetKernelSubGroupInfo;
cl_api_clSetDefaultDeviceCommandQueue clSetDefaultDeviceCommandQueue;
/* OpenCL 2.2 */
cl_api_clSetProgramReleaseCallback clSetProgramReleaseCallback;
cl_api_clSetProgramSpecializationConstant clSetProgramSpecializationConstant;
/* OpenCL 3.0 */
cl_api_clCreateBufferWithProperties clCreateBufferWithProperties;
cl_api_clCreateImageWithProperties clCreateImageWithProperties;
} cl_icd_dispatch;
#ifdef __cplusplus
}
#endif
#endif /* #ifndef OPENCL_CL_ICD_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_platform.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __CL_PLATFORM_H
#define __CL_PLATFORM_H
#include &lt;CL/cl_version.h&gt;
#ifdef __cplusplus
extern "C" {
#endif
/* Calling-convention / linkage decoration macros. On Windows the OpenCL
 * entry points and callbacks use __stdcall; elsewhere the default C calling
 * convention applies, so the macros expand to nothing. */
#if defined(_WIN32)
#define CL_API_ENTRY
#define CL_API_CALL __stdcall
#define CL_CALLBACK __stdcall
#else
#define CL_API_ENTRY
#define CL_API_CALL
#define CL_CALLBACK
#endif
/*
* Deprecation flags refer to the last version of the header in which the
* feature was not deprecated.
*
* E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without
* deprecation but is deprecated in versions later than 1.1.
*/
#define CL_EXTENSION_WEAK_LINK
#define CL_API_SUFFIX__VERSION_1_0
#define CL_EXT_SUFFIX__VERSION_1_0
#define CL_API_SUFFIX__VERSION_1_1
#define CL_EXT_SUFFIX__VERSION_1_1
#define CL_API_SUFFIX__VERSION_1_2
#define CL_EXT_SUFFIX__VERSION_1_2
#define CL_API_SUFFIX__VERSION_2_0
#define CL_EXT_SUFFIX__VERSION_2_0
#define CL_API_SUFFIX__VERSION_2_1
#define CL_EXT_SUFFIX__VERSION_2_1
#define CL_API_SUFFIX__VERSION_2_2
#define CL_EXT_SUFFIX__VERSION_2_2
#define CL_API_SUFFIX__VERSION_3_0
#define CL_EXT_SUFFIX__VERSION_3_0
#define CL_API_SUFFIX__EXPERIMENTAL
#define CL_EXT_SUFFIX__EXPERIMENTAL
#ifdef __GNUC__
#define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated))
#define CL_EXT_PREFIX_DEPRECATED
#elif defined(_WIN32)
#define CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated)
#else
#define CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
#else
#define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
#else
#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
#else
#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS
#define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED
#define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED
#else
#define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS
#define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED
#define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED
#else
#define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
#endif
#ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS
#define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED
#define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED
#else
#define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
#define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
#endif
#if (defined (_WIN32) && defined(_MSC_VER))
/* scalar types */
typedef signed __int8 cl_char;
typedef unsigned __int8 cl_uchar;
typedef signed __int16 cl_short;
typedef unsigned __int16 cl_ushort;
typedef signed __int32 cl_int;
typedef unsigned __int32 cl_uint;
typedef signed __int64 cl_long;
typedef unsigned __int64 cl_ulong;
typedef unsigned __int16 cl_half;
typedef float cl_float;
typedef double cl_double;
/* Macro names and corresponding values defined by OpenCL */
#define CL_CHAR_BIT 8
#define CL_SCHAR_MAX 127
#define CL_SCHAR_MIN (-127-1)
#define CL_CHAR_MAX CL_SCHAR_MAX
#define CL_CHAR_MIN CL_SCHAR_MIN
#define CL_UCHAR_MAX 255
#define CL_SHRT_MAX 32767
#define CL_SHRT_MIN (-32767-1)
#define CL_USHRT_MAX 65535
#define CL_INT_MAX 2147483647
#define CL_INT_MIN (-2147483647-1)
#define CL_UINT_MAX 0xffffffffU
#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
#define CL_FLT_DIG 6
#define CL_FLT_MANT_DIG 24
#define CL_FLT_MAX_10_EXP +38
#define CL_FLT_MAX_EXP +128
#define CL_FLT_MIN_10_EXP -37
#define CL_FLT_MIN_EXP -125
#define CL_FLT_RADIX 2
#define CL_FLT_MAX 340282346638528859811704183484516925440.0f
#define CL_FLT_MIN 1.175494350822287507969e-38f
#define CL_FLT_EPSILON 1.1920928955078125e-7f
#define CL_HALF_DIG 3
#define CL_HALF_MANT_DIG 11
#define CL_HALF_MAX_10_EXP +4
#define CL_HALF_MAX_EXP +16
#define CL_HALF_MIN_10_EXP -4
#define CL_HALF_MIN_EXP -13
#define CL_HALF_RADIX 2
#define CL_HALF_MAX 65504.0f
#define CL_HALF_MIN 6.103515625e-05f
#define CL_HALF_EPSILON 9.765625e-04f
#define CL_DBL_DIG 15
#define CL_DBL_MANT_DIG 53
#define CL_DBL_MAX_10_EXP +308
#define CL_DBL_MAX_EXP +1024
#define CL_DBL_MIN_10_EXP -307
#define CL_DBL_MIN_EXP -1021
#define CL_DBL_RADIX 2
#define CL_DBL_MAX 1.7976931348623158e+308
#define CL_DBL_MIN 2.225073858507201383090e-308
#define CL_DBL_EPSILON 2.220446049250313080847e-16
#define CL_M_E 2.7182818284590452354
#define CL_M_LOG2E 1.4426950408889634074
#define CL_M_LOG10E 0.43429448190325182765
#define CL_M_LN2 0.69314718055994530942
#define CL_M_LN10 2.30258509299404568402
#define CL_M_PI 3.14159265358979323846
#define CL_M_PI_2 1.57079632679489661923
#define CL_M_PI_4 0.78539816339744830962
#define CL_M_1_PI 0.31830988618379067154
#define CL_M_2_PI 0.63661977236758134308
#define CL_M_2_SQRTPI 1.12837916709551257390
#define CL_M_SQRT2 1.41421356237309504880
#define CL_M_SQRT1_2 0.70710678118654752440
#define CL_M_E_F 2.718281828f
#define CL_M_LOG2E_F 1.442695041f
#define CL_M_LOG10E_F 0.434294482f
#define CL_M_LN2_F 0.693147181f
#define CL_M_LN10_F 2.302585093f
#define CL_M_PI_F 3.141592654f
#define CL_M_PI_2_F 1.570796327f
#define CL_M_PI_4_F 0.785398163f
#define CL_M_1_PI_F 0.318309886f
#define CL_M_2_PI_F 0.636619772f
#define CL_M_2_SQRTPI_F 1.128379167f
#define CL_M_SQRT2_F 1.414213562f
#define CL_M_SQRT1_2_F 0.707106781f
#define CL_NAN (CL_INFINITY - CL_INFINITY)
#define CL_HUGE_VALF ((cl_float) 1e50)
#define CL_HUGE_VAL ((cl_double) 1e500)
#define CL_MAXFLOAT CL_FLT_MAX
#define CL_INFINITY CL_HUGE_VALF
#else
#include &lt;stdint.h&gt;
/* scalar types */
typedef int8_t cl_char;
typedef uint8_t cl_uchar;
typedef int16_t cl_short;
typedef uint16_t cl_ushort;
typedef int32_t cl_int;
typedef uint32_t cl_uint;
typedef int64_t cl_long;
typedef uint64_t cl_ulong;
typedef uint16_t cl_half;
typedef float cl_float;
typedef double cl_double;
/* Macro names and corresponding values defined by OpenCL */
#define CL_CHAR_BIT 8
#define CL_SCHAR_MAX 127
#define CL_SCHAR_MIN (-127-1)
#define CL_CHAR_MAX CL_SCHAR_MAX
#define CL_CHAR_MIN CL_SCHAR_MIN
#define CL_UCHAR_MAX 255
#define CL_SHRT_MAX 32767
#define CL_SHRT_MIN (-32767-1)
#define CL_USHRT_MAX 65535
#define CL_INT_MAX 2147483647
#define CL_INT_MIN (-2147483647-1)
#define CL_UINT_MAX 0xffffffffU
#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
#define CL_FLT_DIG 6
#define CL_FLT_MANT_DIG 24
#define CL_FLT_MAX_10_EXP +38
#define CL_FLT_MAX_EXP +128
#define CL_FLT_MIN_10_EXP -37
#define CL_FLT_MIN_EXP -125
#define CL_FLT_RADIX 2
#define CL_FLT_MAX 340282346638528859811704183484516925440.0f
#define CL_FLT_MIN 1.175494350822287507969e-38f
#define CL_FLT_EPSILON 1.1920928955078125e-7f
#define CL_HALF_DIG 3
#define CL_HALF_MANT_DIG 11
#define CL_HALF_MAX_10_EXP +4
#define CL_HALF_MAX_EXP +16
#define CL_HALF_MIN_10_EXP -4
#define CL_HALF_MIN_EXP -13
#define CL_HALF_RADIX 2
#define CL_HALF_MAX 65504.0f
#define CL_HALF_MIN 6.103515625e-05f
#define CL_HALF_EPSILON 9.765625e-04f
#define CL_DBL_DIG 15
#define CL_DBL_MANT_DIG 53
#define CL_DBL_MAX_10_EXP +308
#define CL_DBL_MAX_EXP +1024
#define CL_DBL_MIN_10_EXP -307
#define CL_DBL_MIN_EXP -1021
#define CL_DBL_RADIX 2
#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
#define CL_DBL_MIN 2.225073858507201383090e-308
#define CL_DBL_EPSILON 2.220446049250313080847e-16
#define CL_M_E 2.7182818284590452354
#define CL_M_LOG2E 1.4426950408889634074
#define CL_M_LOG10E 0.43429448190325182765
#define CL_M_LN2 0.69314718055994530942
#define CL_M_LN10 2.30258509299404568402
#define CL_M_PI 3.14159265358979323846
#define CL_M_PI_2 1.57079632679489661923
#define CL_M_PI_4 0.78539816339744830962
#define CL_M_1_PI 0.31830988618379067154
#define CL_M_2_PI 0.63661977236758134308
#define CL_M_2_SQRTPI 1.12837916709551257390
#define CL_M_SQRT2 1.41421356237309504880
#define CL_M_SQRT1_2 0.70710678118654752440
#define CL_M_E_F 2.718281828f
#define CL_M_LOG2E_F 1.442695041f
#define CL_M_LOG10E_F 0.434294482f
#define CL_M_LN2_F 0.693147181f
#define CL_M_LN10_F 2.302585093f
#define CL_M_PI_F 3.141592654f
#define CL_M_PI_2_F 1.570796327f
#define CL_M_PI_4_F 0.785398163f
#define CL_M_1_PI_F 0.318309886f
#define CL_M_2_PI_F 0.636619772f
#define CL_M_2_SQRTPI_F 1.128379167f
#define CL_M_SQRT2_F 1.414213562f
#define CL_M_SQRT1_2_F 0.707106781f
#if defined( __GNUC__ )
#define CL_HUGE_VALF __builtin_huge_valf()
#define CL_HUGE_VAL __builtin_huge_val()
#define CL_NAN __builtin_nanf( "" )
#else
#define CL_HUGE_VALF ((cl_float) 1e50)
#define CL_HUGE_VAL ((cl_double) 1e500)
float nanf( const char * );
#define CL_NAN nanf( "" )
#endif
#define CL_MAXFLOAT CL_FLT_MAX
#define CL_INFINITY CL_HUGE_VALF
#endif
#include &lt;stddef.h&gt;
/* Mirror types to GL types. Mirror types allow us to avoid deciding which GL headers to load based on whether we are using GL or GLES here. */
typedef unsigned int cl_GLuint;
typedef int cl_GLint;
typedef unsigned int cl_GLenum;
/*
* Vector types
*
* Note: OpenCL requires that all types be naturally aligned.
* This means that vector types must be naturally aligned.
* For example, a vector of four floats must be aligned to
* a 16 byte boundary (calculated as 4 * the natural 4-byte
* alignment of the float). The alignment qualifiers here
* will only function properly if your compiler supports them
* and if you don't actively work to defeat them. For example,
* in order for a cl_float4 to be 16 byte aligned in a struct,
* the start of the struct must itself be 16-byte aligned.
*
* Maintaining proper alignment is the user's responsibility.
*/
/* Define basic vector types */
#if defined( __VEC__ )
#include &lt;altivec.h&gt; /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
typedef __vector unsigned char __cl_uchar16;
typedef __vector signed char __cl_char16;
typedef __vector unsigned short __cl_ushort8;
typedef __vector signed short __cl_short8;
typedef __vector unsigned int __cl_uint4;
typedef __vector signed int __cl_int4;
typedef __vector float __cl_float4;
#define __CL_UCHAR16__ 1
#define __CL_CHAR16__ 1
#define __CL_USHORT8__ 1
#define __CL_SHORT8__ 1
#define __CL_UINT4__ 1
#define __CL_INT4__ 1
#define __CL_FLOAT4__ 1
#endif
#if defined( __SSE__ )
#if defined( __MINGW64__ )
#include &lt;intrin.h&gt;
#else
#include &lt;xmmintrin.h&gt;
#endif
#if defined( __GNUC__ )
typedef float __cl_float4 __attribute__((vector_size(16)));
#else
typedef __m128 __cl_float4;
#endif
#define __CL_FLOAT4__ 1
#endif
#if defined( __SSE2__ )
#if defined( __MINGW64__ )
#include &lt;intrin.h&gt;
#else
#include &lt;emmintrin.h&gt;
#endif
#if defined( __GNUC__ )
typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16)));
typedef cl_char __cl_char16 __attribute__((vector_size(16)));
typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16)));
typedef cl_short __cl_short8 __attribute__((vector_size(16)));
typedef cl_uint __cl_uint4 __attribute__((vector_size(16)));
typedef cl_int __cl_int4 __attribute__((vector_size(16)));
typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16)));
typedef cl_long __cl_long2 __attribute__((vector_size(16)));
typedef cl_double __cl_double2 __attribute__((vector_size(16)));
#else
typedef __m128i __cl_uchar16;
typedef __m128i __cl_char16;
typedef __m128i __cl_ushort8;
typedef __m128i __cl_short8;
typedef __m128i __cl_uint4;
typedef __m128i __cl_int4;
typedef __m128i __cl_ulong2;
typedef __m128i __cl_long2;
typedef __m128d __cl_double2;
#endif
#define __CL_UCHAR16__ 1
#define __CL_CHAR16__ 1
#define __CL_USHORT8__ 1
#define __CL_SHORT8__ 1
#define __CL_INT4__ 1
#define __CL_UINT4__ 1
#define __CL_ULONG2__ 1
#define __CL_LONG2__ 1
#define __CL_DOUBLE2__ 1
#endif
#if defined( __MMX__ )
#include &lt;mmintrin.h&gt;
#if defined( __GNUC__ )
typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8)));
typedef cl_char __cl_char8 __attribute__((vector_size(8)));
typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8)));
typedef cl_short __cl_short4 __attribute__((vector_size(8)));
typedef cl_uint __cl_uint2 __attribute__((vector_size(8)));
typedef cl_int __cl_int2 __attribute__((vector_size(8)));
typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8)));
typedef cl_long __cl_long1 __attribute__((vector_size(8)));
typedef cl_float __cl_float2 __attribute__((vector_size(8)));
#else
typedef __m64 __cl_uchar8;
typedef __m64 __cl_char8;
typedef __m64 __cl_ushort4;
typedef __m64 __cl_short4;
typedef __m64 __cl_uint2;
typedef __m64 __cl_int2;
typedef __m64 __cl_ulong1;
typedef __m64 __cl_long1;
typedef __m64 __cl_float2;
#endif
#define __CL_UCHAR8__ 1
#define __CL_CHAR8__ 1
#define __CL_USHORT4__ 1
#define __CL_SHORT4__ 1
#define __CL_INT2__ 1
#define __CL_UINT2__ 1
#define __CL_ULONG1__ 1
#define __CL_LONG1__ 1
#define __CL_FLOAT2__ 1
#endif
#if defined( __AVX__ )
#if defined( __MINGW64__ )
#include &lt;intrin.h&gt;
#else
#include &lt;immintrin.h&gt;
#endif
#if defined( __GNUC__ )
typedef cl_float __cl_float8 __attribute__((vector_size(32)));
typedef cl_double __cl_double4 __attribute__((vector_size(32)));
#else
typedef __m256 __cl_float8;
typedef __m256d __cl_double4;
#endif
#define __CL_FLOAT8__ 1
#define __CL_DOUBLE4__ 1
#endif
/* Define capabilities for anonymous struct members. */
/* C11 and later guarantee anonymous struct/union members, so no extension
 * keyword is required. */
#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
#define __CL_HAS_ANON_STRUCT__ 1
#define __CL_ANON_STRUCT__
/* GCC supports anonymous members as an extension outside strict-ANSI mode;
 * __extension__ silences the pedantic diagnostic at each use site. */
#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
#define __CL_HAS_ANON_STRUCT__ 1
#define __CL_ANON_STRUCT__ __extension__
#elif defined( _WIN32) && defined(_MSC_VER)
#if _MSC_VER >= 1500
/* Microsoft Developer Studio 2008 supports anonymous structs, but
 * complains by default. */
#define __CL_HAS_ANON_STRUCT__ 1
#define __CL_ANON_STRUCT__
/* Disable warning C4201: nonstandard extension used : nameless
 * struct/union */
#pragma warning( push )
#pragma warning( disable : 4201 )
#endif
/* NOTE(review): on MSVC older than 1500 this branch leaves
 * __CL_HAS_ANON_STRUCT__ undefined (which still evaluates to 0 in #if). */
#else
#define __CL_HAS_ANON_STRUCT__ 0
#define __CL_ANON_STRUCT__
#endif
/* Define alignment keys */
/* CL_ALIGNED(n) requests n-byte alignment for the vector unions below. */
#if defined( __GNUC__ )
#define CL_ALIGNED(_x) __attribute__ ((aligned(_x)))
#elif defined( _WIN32) && (_MSC_VER)
/* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */
/* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */
/* #include */
/* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */
#define CL_ALIGNED(_x)
#else
/* Unknown compiler: emit a diagnostic and fall back to natural alignment. */
#warning Need to implement some method to align data here
#define CL_ALIGNED(_x)
#endif
/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
/* Only advertised when the compiler supports anonymous struct members,
 * since the named fields are implemented with them. */
#if __CL_HAS_ANON_STRUCT__
/* .xyzw and .s0123...{f|F} are supported */
#define CL_HAS_NAMED_VECTOR_FIELDS 1
/* .hi and .lo are supported */
#define CL_HAS_HI_LO_VECTOR_FIELDS 1
#endif
/* Define cl_vector types */
/* ---- cl_charn ---- */
/* Each cl_<type>N is a union: the s[] array, optional named fields
 * (.x/.y/..., .s0/.s1/..., .lo/.hi) and optional packed native-vector
 * members all alias the same storage. */
/* 2 x cl_char, 2-byte alignment. */
typedef union
{
cl_char CL_ALIGNED(2) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_char x, y; };
__CL_ANON_STRUCT__ struct{ cl_char s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_char lo, hi; };
#endif
#if defined( __CL_CHAR2__)
__cl_char2 v2;
#endif
}cl_char2;
/* 4 x cl_char, 4-byte alignment. */
typedef union
{
cl_char CL_ALIGNED(4) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
#endif
#if defined( __CL_CHAR2__)
__cl_char2 v2[2];
#endif
#if defined( __CL_CHAR4__)
__cl_char4 v4;
#endif
}cl_char4;
/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
typedef cl_char4 cl_char3;
/* 8 x cl_char, 8-byte alignment. */
typedef union
{
cl_char CL_ALIGNED(8) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
#endif
#if defined( __CL_CHAR2__)
__cl_char2 v2[4];
#endif
#if defined( __CL_CHAR4__)
__cl_char4 v4[2];
#endif
#if defined( __CL_CHAR8__ )
__cl_char8 v8;
#endif
}cl_char8;
/* 16 x cl_char, 16-byte alignment; __spacer members pad .x/.y/.z/.w out to
 * the sa..sf named fields. */
typedef union
{
cl_char CL_ALIGNED(16) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
#endif
#if defined( __CL_CHAR2__)
__cl_char2 v2[8];
#endif
#if defined( __CL_CHAR4__)
__cl_char4 v4[4];
#endif
#if defined( __CL_CHAR8__ )
__cl_char8 v8[2];
#endif
#if defined( __CL_CHAR16__ )
__cl_char16 v16;
#endif
}cl_char16;
/* ---- cl_ucharn ---- */
/* 2 x cl_uchar, 2-byte alignment; s[], named fields and packed v2 alias
 * the same storage. */
typedef union
{
cl_uchar CL_ALIGNED(2) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uchar x, y; };
__CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; };
#endif
/* Fix: guard was lowercase "__cl_uchar2__", inconsistent with every other
 * capability macro in this header (e.g. __CL_UCHAR8__), so the packed v2
 * member could never be enabled. */
#if defined( __CL_UCHAR2__)
__cl_uchar2 v2;
#endif
}cl_uchar2;
/* 4 x cl_uchar, 4-byte alignment. */
typedef union
{
cl_uchar CL_ALIGNED(4) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
#endif
#if defined( __CL_UCHAR2__)
__cl_uchar2 v2[2];
#endif
#if defined( __CL_UCHAR4__)
__cl_uchar4 v4;
#endif
}cl_uchar4;
/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
typedef cl_uchar4 cl_uchar3;
/* 8 x cl_uchar, 8-byte alignment. */
typedef union
{
cl_uchar CL_ALIGNED(8) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
#endif
#if defined( __CL_UCHAR2__)
__cl_uchar2 v2[4];
#endif
#if defined( __CL_UCHAR4__)
__cl_uchar4 v4[2];
#endif
#if defined( __CL_UCHAR8__ )
__cl_uchar8 v8;
#endif
}cl_uchar8;
/* 16 x cl_uchar, 16-byte alignment. */
typedef union
{
cl_uchar CL_ALIGNED(16) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
#endif
#if defined( __CL_UCHAR2__)
__cl_uchar2 v2[8];
#endif
#if defined( __CL_UCHAR4__)
__cl_uchar4 v4[4];
#endif
#if defined( __CL_UCHAR8__ )
__cl_uchar8 v8[2];
#endif
#if defined( __CL_UCHAR16__ )
__cl_uchar16 v16;
#endif
}cl_uchar16;
/* ---- cl_shortn ---- */
/* 2 x cl_short, 4-byte alignment. */
typedef union
{
cl_short CL_ALIGNED(4) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_short x, y; };
__CL_ANON_STRUCT__ struct{ cl_short s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_short lo, hi; };
#endif
#if defined( __CL_SHORT2__)
__cl_short2 v2;
#endif
}cl_short2;
/* 4 x cl_short, 8-byte alignment. */
typedef union
{
cl_short CL_ALIGNED(8) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
#endif
#if defined( __CL_SHORT2__)
__cl_short2 v2[2];
#endif
#if defined( __CL_SHORT4__)
__cl_short4 v4;
#endif
}cl_short4;
/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
typedef cl_short4 cl_short3;
/* 8 x cl_short, 16-byte alignment. */
typedef union
{
cl_short CL_ALIGNED(16) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
#endif
#if defined( __CL_SHORT2__)
__cl_short2 v2[4];
#endif
#if defined( __CL_SHORT4__)
__cl_short4 v4[2];
#endif
#if defined( __CL_SHORT8__ )
__cl_short8 v8;
#endif
}cl_short8;
/* 16 x cl_short, 32-byte alignment. */
typedef union
{
cl_short CL_ALIGNED(32) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
#endif
#if defined( __CL_SHORT2__)
__cl_short2 v2[8];
#endif
#if defined( __CL_SHORT4__)
__cl_short4 v4[4];
#endif
#if defined( __CL_SHORT8__ )
__cl_short8 v8[2];
#endif
#if defined( __CL_SHORT16__ )
__cl_short16 v16;
#endif
}cl_short16;
/* ---- cl_ushortn ---- */
/* 2 x cl_ushort, 4-byte alignment. */
typedef union
{
cl_ushort CL_ALIGNED(4) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ushort x, y; };
__CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; };
#endif
#if defined( __CL_USHORT2__)
__cl_ushort2 v2;
#endif
}cl_ushort2;
/* 4 x cl_ushort, 8-byte alignment. */
typedef union
{
cl_ushort CL_ALIGNED(8) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
#endif
#if defined( __CL_USHORT2__)
__cl_ushort2 v2[2];
#endif
#if defined( __CL_USHORT4__)
__cl_ushort4 v4;
#endif
}cl_ushort4;
/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
typedef cl_ushort4 cl_ushort3;
/* 8 x cl_ushort, 16-byte alignment. */
typedef union
{
cl_ushort CL_ALIGNED(16) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
#endif
#if defined( __CL_USHORT2__)
__cl_ushort2 v2[4];
#endif
#if defined( __CL_USHORT4__)
__cl_ushort4 v4[2];
#endif
#if defined( __CL_USHORT8__ )
__cl_ushort8 v8;
#endif
}cl_ushort8;
/* 16 x cl_ushort, 32-byte alignment. */
typedef union
{
cl_ushort CL_ALIGNED(32) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
#endif
#if defined( __CL_USHORT2__)
__cl_ushort2 v2[8];
#endif
#if defined( __CL_USHORT4__)
__cl_ushort4 v4[4];
#endif
#if defined( __CL_USHORT8__ )
__cl_ushort8 v8[2];
#endif
#if defined( __CL_USHORT16__ )
__cl_ushort16 v16;
#endif
}cl_ushort16;
/* ---- cl_halfn ---- */
/* 2 x cl_half, 4-byte alignment. */
typedef union
{
cl_half CL_ALIGNED(4) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_half x, y; };
__CL_ANON_STRUCT__ struct{ cl_half s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_half lo, hi; };
#endif
#if defined( __CL_HALF2__)
__cl_half2 v2;
#endif
}cl_half2;
/* 4 x cl_half, 8-byte alignment. */
typedef union
{
cl_half CL_ALIGNED(8) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; };
#endif
#if defined( __CL_HALF2__)
__cl_half2 v2[2];
#endif
#if defined( __CL_HALF4__)
__cl_half4 v4;
#endif
}cl_half4;
/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */
typedef cl_half4 cl_half3;
/* 8 x cl_half, 16-byte alignment. */
typedef union
{
cl_half CL_ALIGNED(16) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; };
#endif
#if defined( __CL_HALF2__)
__cl_half2 v2[4];
#endif
#if defined( __CL_HALF4__)
__cl_half4 v4[2];
#endif
#if defined( __CL_HALF8__ )
__cl_half8 v8;
#endif
}cl_half8;
/* 16 x cl_half, 32-byte alignment. */
typedef union
{
cl_half CL_ALIGNED(32) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; };
#endif
#if defined( __CL_HALF2__)
__cl_half2 v2[8];
#endif
#if defined( __CL_HALF4__)
__cl_half4 v4[4];
#endif
#if defined( __CL_HALF8__ )
__cl_half8 v8[2];
#endif
#if defined( __CL_HALF16__ )
__cl_half16 v16;
#endif
}cl_half16;
/* ---- cl_intn ---- */
/* 2 x cl_int, 8-byte alignment. */
typedef union
{
cl_int CL_ALIGNED(8) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_int x, y; };
__CL_ANON_STRUCT__ struct{ cl_int s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_int lo, hi; };
#endif
#if defined( __CL_INT2__)
__cl_int2 v2;
#endif
}cl_int2;
/* 4 x cl_int, 16-byte alignment. */
typedef union
{
cl_int CL_ALIGNED(16) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
#endif
#if defined( __CL_INT2__)
__cl_int2 v2[2];
#endif
#if defined( __CL_INT4__)
__cl_int4 v4;
#endif
}cl_int4;
/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
typedef cl_int4 cl_int3;
/* 8 x cl_int, 32-byte alignment. */
typedef union
{
cl_int CL_ALIGNED(32) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
#endif
#if defined( __CL_INT2__)
__cl_int2 v2[4];
#endif
#if defined( __CL_INT4__)
__cl_int4 v4[2];
#endif
#if defined( __CL_INT8__ )
__cl_int8 v8;
#endif
}cl_int8;
/* 16 x cl_int, 64-byte alignment. */
typedef union
{
cl_int CL_ALIGNED(64) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
#endif
#if defined( __CL_INT2__)
__cl_int2 v2[8];
#endif
#if defined( __CL_INT4__)
__cl_int4 v4[4];
#endif
#if defined( __CL_INT8__ )
__cl_int8 v8[2];
#endif
#if defined( __CL_INT16__ )
__cl_int16 v16;
#endif
}cl_int16;
/* ---- cl_uintn ---- */
/* 2 x cl_uint, 8-byte alignment. */
typedef union
{
cl_uint CL_ALIGNED(8) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uint x, y; };
__CL_ANON_STRUCT__ struct{ cl_uint s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_uint lo, hi; };
#endif
#if defined( __CL_UINT2__)
__cl_uint2 v2;
#endif
}cl_uint2;
/* 4 x cl_uint, 16-byte alignment. */
typedef union
{
cl_uint CL_ALIGNED(16) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
#endif
#if defined( __CL_UINT2__)
__cl_uint2 v2[2];
#endif
#if defined( __CL_UINT4__)
__cl_uint4 v4;
#endif
}cl_uint4;
/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
typedef cl_uint4 cl_uint3;
/* 8 x cl_uint, 32-byte alignment. */
typedef union
{
cl_uint CL_ALIGNED(32) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
#endif
#if defined( __CL_UINT2__)
__cl_uint2 v2[4];
#endif
#if defined( __CL_UINT4__)
__cl_uint4 v4[2];
#endif
#if defined( __CL_UINT8__ )
__cl_uint8 v8;
#endif
}cl_uint8;
/* 16 x cl_uint, 64-byte alignment. */
typedef union
{
cl_uint CL_ALIGNED(64) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
#endif
#if defined( __CL_UINT2__)
__cl_uint2 v2[8];
#endif
#if defined( __CL_UINT4__)
__cl_uint4 v4[4];
#endif
#if defined( __CL_UINT8__ )
__cl_uint8 v8[2];
#endif
#if defined( __CL_UINT16__ )
__cl_uint16 v16;
#endif
}cl_uint16;
/* ---- cl_longn ---- */
/* 2 x cl_long, 16-byte alignment. */
typedef union
{
cl_long CL_ALIGNED(16) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_long x, y; };
__CL_ANON_STRUCT__ struct{ cl_long s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_long lo, hi; };
#endif
#if defined( __CL_LONG2__)
__cl_long2 v2;
#endif
}cl_long2;
/* 4 x cl_long, 32-byte alignment. */
typedef union
{
cl_long CL_ALIGNED(32) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
#endif
#if defined( __CL_LONG2__)
__cl_long2 v2[2];
#endif
#if defined( __CL_LONG4__)
__cl_long4 v4;
#endif
}cl_long4;
/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
typedef cl_long4 cl_long3;
/* 8 x cl_long, 64-byte alignment. */
typedef union
{
cl_long CL_ALIGNED(64) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
#endif
#if defined( __CL_LONG2__)
__cl_long2 v2[4];
#endif
#if defined( __CL_LONG4__)
__cl_long4 v4[2];
#endif
#if defined( __CL_LONG8__ )
__cl_long8 v8;
#endif
}cl_long8;
/* 16 x cl_long, 128-byte alignment. */
typedef union
{
cl_long CL_ALIGNED(128) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
#endif
#if defined( __CL_LONG2__)
__cl_long2 v2[8];
#endif
#if defined( __CL_LONG4__)
__cl_long4 v4[4];
#endif
#if defined( __CL_LONG8__ )
__cl_long8 v8[2];
#endif
#if defined( __CL_LONG16__ )
__cl_long16 v16;
#endif
}cl_long16;
/* ---- cl_ulongn ---- */
/* 2 x cl_ulong, 16-byte alignment. */
typedef union
{
cl_ulong CL_ALIGNED(16) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ulong x, y; };
__CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; };
#endif
#if defined( __CL_ULONG2__)
__cl_ulong2 v2;
#endif
}cl_ulong2;
/* 4 x cl_ulong, 32-byte alignment. */
typedef union
{
cl_ulong CL_ALIGNED(32) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
#endif
#if defined( __CL_ULONG2__)
__cl_ulong2 v2[2];
#endif
#if defined( __CL_ULONG4__)
__cl_ulong4 v4;
#endif
}cl_ulong4;
/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
typedef cl_ulong4 cl_ulong3;
/* 8 x cl_ulong, 64-byte alignment. */
typedef union
{
cl_ulong CL_ALIGNED(64) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
#endif
#if defined( __CL_ULONG2__)
__cl_ulong2 v2[4];
#endif
#if defined( __CL_ULONG4__)
__cl_ulong4 v4[2];
#endif
#if defined( __CL_ULONG8__ )
__cl_ulong8 v8;
#endif
}cl_ulong8;
/* 16 x cl_ulong, 128-byte alignment. */
typedef union
{
cl_ulong CL_ALIGNED(128) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
#endif
#if defined( __CL_ULONG2__)
__cl_ulong2 v2[8];
#endif
#if defined( __CL_ULONG4__)
__cl_ulong4 v4[4];
#endif
#if defined( __CL_ULONG8__ )
__cl_ulong8 v8[2];
#endif
#if defined( __CL_ULONG16__ )
__cl_ulong16 v16;
#endif
}cl_ulong16;
/* --- cl_floatn ---- */
/* 2 x cl_float, 8-byte alignment. */
typedef union
{
cl_float CL_ALIGNED(8) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_float x, y; };
__CL_ANON_STRUCT__ struct{ cl_float s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_float lo, hi; };
#endif
#if defined( __CL_FLOAT2__)
__cl_float2 v2;
#endif
}cl_float2;
/* 4 x cl_float, 16-byte alignment. */
typedef union
{
cl_float CL_ALIGNED(16) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; };
#endif
#if defined( __CL_FLOAT2__)
__cl_float2 v2[2];
#endif
#if defined( __CL_FLOAT4__)
__cl_float4 v4;
#endif
}cl_float4;
/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
typedef cl_float4 cl_float3;
/* 8 x cl_float, 32-byte alignment. */
typedef union
{
cl_float CL_ALIGNED(32) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; };
#endif
#if defined( __CL_FLOAT2__)
__cl_float2 v2[4];
#endif
#if defined( __CL_FLOAT4__)
__cl_float4 v4[2];
#endif
#if defined( __CL_FLOAT8__ )
__cl_float8 v8;
#endif
}cl_float8;
/* 16 x cl_float, 64-byte alignment. */
typedef union
{
cl_float CL_ALIGNED(64) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
#endif
#if defined( __CL_FLOAT2__)
__cl_float2 v2[8];
#endif
#if defined( __CL_FLOAT4__)
__cl_float4 v4[4];
#endif
#if defined( __CL_FLOAT8__ )
__cl_float8 v8[2];
#endif
#if defined( __CL_FLOAT16__ )
__cl_float16 v16;
#endif
}cl_float16;
/* --- cl_doublen ---- */
/* 2 x cl_double, 16-byte alignment. */
typedef union
{
cl_double CL_ALIGNED(16) s[2];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_double x, y; };
__CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
__CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
#endif
#if defined( __CL_DOUBLE2__)
__cl_double2 v2;
#endif
}cl_double2;
/* 4 x cl_double, 32-byte alignment. */
typedef union
{
cl_double CL_ALIGNED(32) s[4];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; };
__CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
#endif
#if defined( __CL_DOUBLE2__)
__cl_double2 v2[2];
#endif
#if defined( __CL_DOUBLE4__)
__cl_double4 v4;
#endif
}cl_double4;
/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
typedef cl_double4 cl_double3;
/* 8 x cl_double, 64-byte alignment. */
typedef union
{
cl_double CL_ALIGNED(64) s[8];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; };
__CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; };
__CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
#endif
#if defined( __CL_DOUBLE2__)
__cl_double2 v2[4];
#endif
#if defined( __CL_DOUBLE4__)
__cl_double4 v4[2];
#endif
#if defined( __CL_DOUBLE8__ )
__cl_double8 v8;
#endif
}cl_double8;
/* 16 x cl_double, 128-byte alignment. */
typedef union
{
cl_double CL_ALIGNED(128) s[16];
#if __CL_HAS_ANON_STRUCT__
__CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
__CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
__CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
#endif
#if defined( __CL_DOUBLE2__)
__cl_double2 v2[8];
#endif
#if defined( __CL_DOUBLE4__)
__cl_double4 v4[4];
#endif
#if defined( __CL_DOUBLE8__ )
__cl_double8 v8[2];
#endif
#if defined( __CL_DOUBLE16__ )
__cl_double16 v16;
#endif
}cl_double16;
/* Macro to facilitate debugging
* Usage:
* Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
* The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \"
* Each line thereafter of OpenCL C source must end with: \n\
* The last line ends in ";
*
* Example:
*
* const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
* kernel void foo( int a, float * b ) \n\
* { \n\
* // my comment \n\
* *b[ get_global_id(0)] = a; \n\
* } \n\
* ";
*
* This should correctly set up the line, (column) and file information for your source
* string so you can do source level debugging.
*/
/* Two-level stringification so that __LINE__ is macro-expanded before being
 * turned into a string literal; the result prefixes a kernel source string
 * with a #line directive naming this file and line. */
#define __CL_STRINGIFY( _x ) # _x
#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x )
#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
#ifdef __cplusplus
}
#endif
#undef __CL_HAS_ANON_STRUCT__
#undef __CL_ANON_STRUCT__
#if defined( _WIN32) && defined(_MSC_VER)
#if _MSC_VER >=1500
#pragma warning( pop )
#endif
#endif
#endif /* __CL_PLATFORM_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_va_api_media_sharing_intel.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2019 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_va_api_media_sharing_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
#include
#include
#include
#ifdef __cplusplus
extern "C" {
#endif
/******************************************
* cl_intel_va_api_media_sharing extension *
*******************************************/
#define cl_intel_va_api_media_sharing 1
/* error codes */
#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098
#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099
#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100
#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101
/* cl_va_api_device_source_intel */
#define CL_VA_API_DISPLAY_INTEL 0x4094
/* cl_va_api_device_set_intel */
#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095
#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096
/* cl_context_info */
#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097
/* cl_mem_info */
#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098
/* cl_image_info */
#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A
#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B
typedef cl_uint cl_va_api_device_source_intel;
typedef cl_uint cl_va_api_device_set_intel;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
cl_platform_id platform,
cl_va_api_device_source_intel media_adapter_type,
void* media_adapter,
cl_va_api_device_set_intel media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
cl_platform_id platform,
cl_va_api_device_source_intel media_adapter_type,
void* media_adapter,
cl_va_api_device_set_intel media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromVA_APIMediaSurfaceINTEL(
cl_context context,
cl_mem_flags flags,
VASurfaceID* surface,
cl_uint plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(
cl_context context,
cl_mem_flags flags,
VASurfaceID* surface,
cl_uint plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireVA_APIMediaSurfacesINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseVA_APIMediaSurfacesINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/cl_version.h
================================================
/*******************************************************************************
* Copyright (c) 2018-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __CL_VERSION_H
#define __CL_VERSION_H
/* Detect which version to target */
/* Users should define CL_TARGET_OPENCL_VERSION before including any CL
 * header; otherwise the headers target OpenCL 2.2 and warn about it. */
#if !defined(CL_TARGET_OPENCL_VERSION)
#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)")
#define CL_TARGET_OPENCL_VERSION 220
#endif
#if CL_TARGET_OPENCL_VERSION != 100 && \
CL_TARGET_OPENCL_VERSION != 110 && \
CL_TARGET_OPENCL_VERSION != 120 && \
CL_TARGET_OPENCL_VERSION != 200 && \
CL_TARGET_OPENCL_VERSION != 210 && \
CL_TARGET_OPENCL_VERSION != 220 && \
CL_TARGET_OPENCL_VERSION != 300
#pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220, 300). Defaulting to 220 (OpenCL 2.2)")
#undef CL_TARGET_OPENCL_VERSION
#define CL_TARGET_OPENCL_VERSION 220
#endif
/* OpenCL Version */
/* Define every CL_VERSION_x_y macro at or below the targeted version. */
#if CL_TARGET_OPENCL_VERSION >= 300 && !defined(CL_VERSION_3_0)
#define CL_VERSION_3_0 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2)
#define CL_VERSION_2_2 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1)
#define CL_VERSION_2_1 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0)
#define CL_VERSION_2_0 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2)
#define CL_VERSION_1_2 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1)
#define CL_VERSION_1_1 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0)
#define CL_VERSION_1_0 1
#endif
/* Allow deprecated APIs for older OpenCL versions. */
/* Each CL_USE_DEPRECATED_OPENCL_x_y_APIS macro keeps APIs deprecated after
 * version x.y visible when targeting x.y or older. */
#if CL_TARGET_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_2_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_1_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_0_APIS
#endif
#endif /* __CL_VERSION_H */
================================================
FILE: GpuMemLatency/OpenCL/include/CL/opencl.h
================================================
/*******************************************************************************
* Copyright (c) 2008-2020 The Khronos Group Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
#ifndef __OPENCL_H
#define __OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#include
#include
#include
#include
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_H */
================================================
FILE: GpuMemLatency/atomic_test.c
================================================
#include "opencltest.h"
/*
 * Measures integer atomic latency by having two work-items bounce an atomic
 * value between them via the given kernel.
 *
 * context/command_queue/kernel: initialized OpenCL objects; the kernel takes
 *   (global uint* target, int iterations, global int* result).
 * iterations: atomic ping-pong iterations the kernel executes.
 * local: nonzero = place both work-items in one work-group (local/same-group
 *   atomics); zero = one work-item per work-group.
 * time_ms: out-param receiving the raw elapsed wall time in milliseconds.
 *
 * Returns estimated latency in nanoseconds per atomic operation, or 0 on
 * kernel launch failure.
 */
float int_atomic_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations,
    short local,
    uint32_t *time_ms)
{
    cl_int ret;
    cl_int result = 0;            /* initial value uploaded to the result buffer */
    size_t global_item_size = 2;  /* two work-items contend on the atomic */
    size_t local_item_size = 1;   /* default: each work-item in its own group */
    float latency;
    uint32_t time_diff_ms;
    uint32_t A = 0;

    if (local)
    {
        /* Both work-items share one work-group, so the atomic can be
         * serviced with group-local resources. */
        local_item_size = 2;
    }

    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(uint32_t), NULL, &ret);
    /* Bug fix: the error code used to be written into &result, clobbering the
     * intended initial value (0) that is uploaded to the buffer below. */
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &ret);

    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, sizeof(uint32_t), &A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(cl_int), &result, 0, NULL, NULL);
    clFinish(command_queue);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        latency = 0;
        goto cleanup;
    }
    clFinish(command_queue);
    time_diff_ms = end_timing();

    *time_ms = time_diff_ms;
    /* 1e6 converts ms -> ns; /2 because each iteration involves both
     * work-items touching the atomic once. */
    latency = (1e6 * (float)time_diff_ms / (float)(iterations)) / 2;

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    return latency;
}
// All-to-all "core to core" atomic latency test: for every ordered pair of
// compute units, measures atomic ping-pong latency between a work-item pinned
// to each (kernel args 3/4 select the pair), then prints a CSV matrix to
// stdout ('x' on the diagonal). Progress goes to stderr.
// Returns the last measured latency in nanoseconds, or 0 on failure / when
// fewer than two compute units are available.
float c2c_atomic_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations)
{
    cl_int ret;
    cl_int result = 0;
    size_t global_item_size;
    size_t local_item_size = 1;
    float latency = 0.0f; // FIX: was returned uninitialized when cuCount < 2
    uint32_t time_diff_ms;
    uint32_t A;
    cl_uint cuCount = getCuCount();
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(uint32_t), NULL, &ret);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &ret);
    global_item_size = cuCount; // one work-item per compute unit

    float* result_arr = (float*)malloc(sizeof(float) * cuCount * cuCount);
    if (result_arr == NULL)
    {
        // FIX: previously an allocation failure led to a NULL store below
        fprintf(stderr, "Failed to allocate latency matrix\n");
        goto cleanup;
    }

    for (cl_int t1_idx = 0; t1_idx < (cl_int)cuCount; t1_idx++)
    {
        for (cl_int t2_idx = 0; t2_idx < (cl_int)cuCount; t2_idx++)
        {
            if (t1_idx == t2_idx) continue;
            fprintf(stderr, "Testing %d -> %d\n", t1_idx, t2_idx);
            A = 0;
            ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, sizeof(uint32_t), &A, 0, NULL, NULL);
            ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(cl_int), &result, 0, NULL, NULL);
            clFinish(command_queue);
            clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
            clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);
            clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
            clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&t1_idx);
            clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&t2_idx);
            start_timing();
            ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
            if (ret != CL_SUCCESS)
            {
                fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
                latency = 0;
                goto cleanup;
            }
            clFinish(command_queue);
            time_diff_ms = end_timing();
            // two hops per iteration -> divide by 2 for one-way latency (ns)
            latency = (1e6 * (float)time_diff_ms / (float)(iterations)) / 2;
            fprintf(stderr, "%d -> %d: %f\n", t1_idx, t2_idx, latency);
            result_arr[t1_idx * cuCount + t2_idx] = latency;
        }
    }

    // CSV header row: destination CU indices
    for (cl_int i = 0; i < (cl_int)cuCount; i++)
    {
        printf(",%d", i);
    }
    printf("\n");
    for (cl_int t1_idx = 0; t1_idx < (cl_int)cuCount; t1_idx++)
    {
        printf("%d", t1_idx);
        for (cl_int t2_idx = 0; t2_idx < (cl_int)cuCount; t2_idx++)
        {
            if (t1_idx == t2_idx) printf(",x");
            else printf(",%f", result_arr[t1_idx * cuCount + t2_idx]);
        }
        printf("\n");
    }

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    free(result_arr); // free(NULL) is safe if allocation failed
    return latency;
}
// Integer atomic-add throughput test. Each of 'threads' work-items hammers
// its own slot with atomic adds; iteration count is scaled up until the run
// lasts at least TARGET_TIME_MS / 2.
// Returns throughput in giga-operations per second, or 0 on failure.
float int_atomic_add_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    size_t threads,
    size_t localsize)
{
    // Loop unroll factor inside the kernel
    const float opsPerIteration = 8.0f;
    cl_int ret;
    int64_t time_diff_ms = 0;
    float gOpsPerSec = 0.0f; // FIX: initialize so failure paths return 0
    uint32_t iterations = 7000;

    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * threads);
    if (A == NULL)
    {
        // FIX: allocation failure previously fell through to a NULL store
        fprintf(stderr, "Failed to allocate host array for %d threads\n", (int)threads);
        return 0.0f;
    }
    for (size_t i = 0; i < threads; i++) A[i] = (uint32_t)i;

    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint32_t) * threads, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, sizeof(uint32_t) * threads, A, 0, NULL, NULL);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);
    clFinish(command_queue);

    // keep re-running with more iterations until the test runs long enough
    // to give a stable measurement
    while (time_diff_ms < TARGET_TIME_MS / 2) {
        start_timing();
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &threads, &localsize, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
            fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
            gOpsPerSec = 0;
            goto int_atomic_add_test_end;
        }
        clFinish(command_queue);
        time_diff_ms = end_timing();
        float totalOps = (float)iterations * opsPerIteration * (float)threads;
        gOpsPerSec = ((float)totalOps / 1e9) / ((float)time_diff_ms / 1000);
        fprintf(stderr, "GOPS: %f, elapsed time: %lld\n", gOpsPerSec, (long long)time_diff_ms);
        iterations = adjust_iterations(iterations, time_diff_ms);
        clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);
    }

int_atomic_add_test_end:
    clReleaseMemObject(a_mem_obj);
    free(A);
    return gOpsPerSec;
}
================================================
FILE: GpuMemLatency/bw_test.c
================================================
#include "opencltest.h"
// Global memory read bandwidth test.
// list_size: test array size in 32-bit floats (kernel reads float4 vectors)
// thread_count / local_size: global and local work sizes
// skip: stride between workgroup start offsets; 0 = auto heuristic
// chase_iterations: reads per work-item
// Returns bandwidth in GB/s, or 0 on failure.
float bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint64_t list_size,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t skip,
    uint32_t chase_iterations)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    float bandwidth, total_data_gb;
    cl_int ret;
    cl_int float4size = list_size / 4; // element count in float4 units
    int64_t time_diff_ms;

    if (skip == 0)
    {
        // nemes's read-combining-defeating heuristic
        uint32_t region_size = list_size * sizeof(float);
        uint32_t current_region_steps = (uint32_t)(region_size / (local_size * 4));
        skip = (chase_iterations + current_region_steps + 1) * local_size * 4;
    }

    float* A = (float*)malloc(sizeof(float) * list_size);
    float* result = (float*)malloc(sizeof(float) * thread_count);
    // assume that cl_uint size is 4 bytes, same as float size
    cl_uint* start_offsets = (cl_uint*)malloc(sizeof(cl_uint) * thread_count);
    cl_uint* calculated_offsets = (cl_uint*)malloc(sizeof(cl_uint) * thread_count);
    if (!A || !result || !start_offsets || !calculated_offsets)
    {
        // FIX: previously only warned, then dereferenced NULL below.
        // Also: %lu is wrong for uint64_t on LLP64 platforms.
        fprintf(stderr, "Failed to allocate memory for test size %llu KB\n",
            (unsigned long long)(list_size * sizeof(float) / 1024));
        free(A);
        free(result);
        free(start_offsets);
        free(calculated_offsets);
        return 0.0f;
    }
    memset(calculated_offsets, 0, sizeof(uint32_t) * thread_count);

    for (uint32_t i = 0; i < list_size; i++)
    {
        A[i] = (float)(i * 0.5);
    }

    // tell each thread where to start
    for (uint32_t i = 0; i < thread_count; i++)
    {
        uint32_t localId = i % local_size;
        uint32_t groupId = i / local_size;
        start_offsets[i] = (cl_uint)((groupId * skip * local_size + localId) % (float4size - 1));
        // randomly start each workgroup somewhere - ends up being really bad
        /*cl_uint groupOffset = rand() % (float4size / local_size);
        start_offsets[i] = (cl_uint)((groupOffset * local_size + localId) % (float4size - 1));*/
    }

    // copy array to device
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_size * sizeof(float), NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, list_size * sizeof(float), A, 0, NULL, NULL);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL);
    cl_mem start_offsets_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint) * thread_count, NULL, &ret);
    if (ret != 0) fprintf(stderr, "create buffer for start offsets failed. ret = %d\n", ret);
    ret = clEnqueueWriteBuffer(command_queue, start_offsets_obj, CL_TRUE, 0, sizeof(cl_uint) * thread_count, start_offsets, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue write buffer for start offsets failed. ret = %d\n", ret);

    // Set kernel arguments for __kernel void sum_bw_test(__global float* A, int count, int float4size, __global float* ret, int skip, __global int *startPositions)
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&float4size);
    clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&result_obj);
    clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&skip);
    clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&start_offsets_obj);
    clFinish(command_queue); // writes should be blocking, but are they?

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    time_diff_ms = end_timing();

    // each thread does iterations reads
    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count + thread_count) / 1e9;
    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;
    //fprintf(stderr, "%llu ms, %llu GB\n", time_diff_ms, total_data_gb);

    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret);
    clFinish(command_queue);

    // read back the offsets the kernel actually used and cross-check them
    // against the host-side calculation to catch indexing bugs
    ret = clEnqueueReadBuffer(command_queue, start_offsets_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, calculated_offsets, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for start offsets failed. ret = %d\n", ret);
    clFinish(command_queue);
    if (memcmp(calculated_offsets, start_offsets, sizeof(uint32_t) * thread_count))
    {
        fprintf(stderr, "mismatch in calculated start offsets\n");
        for (uint32_t i = 0; i < thread_count; i++)
        {
            if (calculated_offsets[i] != start_offsets[i]) {
                fprintf(stderr, "At index %u, calculated from GPU = %u, calculated on CPU = %u. skip=%u\n", i, calculated_offsets[i], start_offsets[i], skip);
                break;
            }
        }
    }
    //fprintf(stderr, "Finished reading result. Sum: %d\n", result[0]);

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    clReleaseMemObject(start_offsets_obj);
    free(A);
    free(result);
    free(start_offsets);
    free(calculated_offsets);
    return bandwidth;
}
// Texture (2D image) sampling bandwidth test.
// width/height: image dimensions in pixels (CL_R / CL_FLOAT)
// randomize: nonzero fills the image with random data instead of a ramp
// chase_iterations: samples per work-item; each sample is a 4-wide vector
// time_ms: out parameter, elapsed wall time in ms (set only on success)
// Returns sampling rate in gigatexels/s, or 0 on failure.
float tex_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint64_t width,
    uint64_t height,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t randomize,
    uint32_t chase_iterations,
    int64_t *time_ms)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    float texels = 0;
    cl_int ret;
    int64_t time_diff_ms;
    // element count; 3x overallocated relative to the width*height single
    // channel image created below — NOTE(review): looks intentional slack,
    // confirm against the kernel's addressing
    uint64_t tex_array_size = 3 * width * height;
    cl_mem tex_mem_obj = NULL, result_obj = NULL;

    float* A = (float*)malloc(sizeof(float) * tex_array_size);
    float* result = (float*)malloc(sizeof(float) * thread_count);
    if (!A || !result)
    {
        // FIX: previously only warned, then wrote through a NULL pointer
        fprintf(stderr, "Failed to allocate memory for %llu x %llu texture\n",
            (unsigned long long)width, (unsigned long long)height);
        free(A);
        free(result);
        return 0;
    }

    // fill array
    for (uint64_t i = 0; i < tex_array_size; i++)
    {
        A[i] = randomize ? rand() * 0.2f : (float)(i * 0.5);
    }

    cl_image_desc imageDesc;
    memset(&imageDesc, 0, sizeof(cl_image_desc));
    imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
    imageDesc.image_width = width;
    imageDesc.image_height = height;

    cl_image_format imageFormat;
    imageFormat.image_channel_order = CL_R;
    imageFormat.image_channel_data_type = CL_FLOAT;

    // FIX: host_ptr must be NULL when neither CL_MEM_USE_HOST_PTR nor
    // CL_MEM_COPY_HOST_PTR is set (spec mandates CL_INVALID_HOST_PTR
    // otherwise); the data is uploaded with clEnqueueWriteImage below.
    tex_mem_obj = clCreateImage(context, CL_MEM_READ_ONLY, &imageFormat, &imageDesc, NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to create 2d texture: %d\n", ret);
        goto tex_bw_cleanup;
    }

    size_t origin[] = { 0, 0, 0 };
    size_t region[] = { width, height, 1 };
    ret = clEnqueueWriteImage(command_queue, tex_mem_obj, CL_TRUE, origin, region, 0, 0, A, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to copy 2d texture: %d\n", ret);
        goto tex_bw_cleanup;
    }
    fprintf(stderr, "Created image\n");

    result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tex_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue); // writes should be blocking, but are they?

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        texels = 0;
        goto tex_bw_cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        texels = 0;
        goto tex_bw_cleanup;
    }
    time_diff_ms = end_timing();
    fprintf(stderr, "elapsed time: %lld ms\n", (long long)time_diff_ms);

    // each thread does iterations samples, and each sample returns a 4-wide vector
    texels = 1000 * (float)(chase_iterations * thread_count * 4 / 1e9) / (float)time_diff_ms;
    // FIX: %lu was wrong for int64_t time_diff_ms
    fprintf(stderr, "%u iterations, %u threads, %lld ms\n", chase_iterations, thread_count, (long long)time_diff_ms);

    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret);
    clFinish(command_queue);
    *time_ms = time_diff_ms;

tex_bw_cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    // guard releases: objects may legitimately be NULL on early failure
    if (tex_mem_obj) clReleaseMemObject(tex_mem_obj);
    if (result_obj) clReleaseMemObject(result_obj);
    free(A);
    free(result);
    return texels;
}
// must be at least as large as local memory test size in kernel
// list size in 32-bit elements
#define local_mem_bw_test_size 8192

// Local (workgroup/shared) memory bandwidth test. The kernel copies the
// source buffer into local memory and reads it chase_iterations times.
// time_ms: out parameter, elapsed wall time in ms (set only on success).
// Returns bandwidth in GB/s, or 0 on failure.
float local_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int64_t *time_ms)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    float bandwidth, total_data_gb;
    cl_int ret;
    int64_t time_diff_ms;

    // FIX: pointers were initialized from (uint32_t*) casts
    float* A = (float*)malloc(sizeof(float) * local_mem_bw_test_size);
    float* result = (float*)malloc(sizeof(float) * thread_count);
    if (!A || !result)
    {
        // FIX: previously only warned, then dereferenced NULL
        fprintf(stderr, "Failed to allocate memory for test size %d KB\n",
            (int)(local_mem_bw_test_size * sizeof(float) / 1024));
        free(A);
        free(result);
        return 0.0f;
    }

    for (uint32_t i = 0; i < local_mem_bw_test_size; i++)
    {
        A[i] = i + .02;
    }

    // copy array to device
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, local_mem_bw_test_size * sizeof(float), NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, local_mem_bw_test_size * sizeof(float), A, 0, NULL, NULL);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue); // writes should be blocking, but are they?

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    time_diff_ms = end_timing();
    *time_ms = time_diff_ms;

    // each thread does iterations reads
    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count) / 1e9;
    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;
    //fprintf(stderr, "%llu ms, %llu GB\n", time_diff_ms, total_data_gb);

    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret);
    clFinish(command_queue);

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    free(A);
    free(result);
    return bandwidth;
}
#define buffer_test_size 4096 // 1024x uint4

// Buffer-backed 1D image (texel buffer) read bandwidth test.
// The source buffer is wrapped in a CL_MEM_OBJECT_IMAGE1D_BUFFER image and
// the kernel reads it chase_iterations times per work-item.
// time_ms: out parameter, elapsed wall time in ms (set only on success).
// Returns bandwidth in GB/s, or 0 on failure.
float buffer_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int64_t* time_ms)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    float bandwidth = 0.0f, total_data_gb;
    cl_int ret;
    int64_t time_diff_ms;
    // initialize so cleanup can safely guard-release on early failure
    cl_mem a_mem_obj = NULL, tex_obj = NULL, result_obj = NULL;

    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * buffer_test_size);
    // FIX: result was allocated with a mismatched (uint32_t*) cast
    float* result = (float*)malloc(sizeof(float) * thread_count);
    if (!A || !result)
    {
        // FIX: previously warned (with the wrong size macro) and fell through
        fprintf(stderr, "Failed to allocate memory for test size %d KB\n",
            (int)(buffer_test_size * sizeof(uint32_t) / 1024));
        free(A);
        free(result);
        return 0.0f;
    }

    for (uint32_t i = 0; i < buffer_test_size; i++)
    {
        A[i] = i + 1;
    }

    // copy array to device
    a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_test_size * sizeof(uint32_t), NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, buffer_test_size * sizeof(uint32_t), A, 0, NULL, NULL);

    // wrap the buffer in a 1D image so the kernel can read through the
    // texture path
    cl_image_format imageFormat;
    imageFormat.image_channel_data_type = CL_UNSIGNED_INT32;
    imageFormat.image_channel_order = CL_R;
    cl_image_desc imageDesc;
    memset(&imageDesc, 0, sizeof(cl_image_desc));
    imageDesc.buffer = a_mem_obj;
    imageDesc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
    imageDesc.image_width = buffer_test_size; // width in pixels
    // FIX: was a stuttered double assignment (tex_obj = tex_obj = ...)
    tex_obj = clCreateImage(context, CL_MEM_READ_ONLY, &imageFormat, &imageDesc, NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to create 1d buffer image: %d\n", ret);
        goto cleanup;
    }

    size_t origin[] = { 0, 0, 0 };
    size_t region[] = { imageDesc.image_width, 1, 1 };
    ret = clEnqueueWriteImage(command_queue, tex_obj, CL_TRUE, origin, region, 0, 0, A, 0, NULL, NULL);

    result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tex_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue); // writes should be blocking, but are they?

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    time_diff_ms = end_timing();
    *time_ms = time_diff_ms;

    // each thread does iterations reads
    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count) / 1e9;
    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;
    //fprintf(stderr, "%llu ms, %llu GB\n", time_diff_ms, total_data_gb);

    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret);
    clFinish(command_queue);

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    if (tex_obj) clReleaseMemObject(tex_obj); // FIX: image object was leaked
    if (a_mem_obj) clReleaseMemObject(a_mem_obj);
    if (result_obj) clReleaseMemObject(result_obj);
    free(A);
    free(result);
    return bandwidth;
}
// Local memory pointer-chasing bandwidth test. The host array is filled with
// a fixed stride-by-wave_size pattern (wrapping at the test size) that each
// work-item chases chase_iterations times.
// time_ms: out parameter, elapsed wall time in ms (set only on success).
// Returns bandwidth in GB/s, or 0 on failure.
float local_chase_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    uint32_t wave_size,
    int64_t* time_ms)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    float bandwidth, total_data_gb;
    cl_int ret;
    int64_t time_diff_ms;

    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * local_mem_bw_test_size);
    uint32_t* result = (uint32_t*)malloc(sizeof(uint32_t) * thread_count);
    if (!A || !result)
    {
        // FIX: previously only warned, then dereferenced NULL
        fprintf(stderr, "Failed to allocate memory for test size %d KB\n",
            (int)(local_mem_bw_test_size * sizeof(uint32_t) / 1024));
        free(A);
        free(result);
        return 0.0f;
    }

    for (uint32_t i = 0; i < local_mem_bw_test_size; i++)
    {
        // assumes local_mem_bw_test_size is a power of 2.
        // parenthesized for clarity; '+' already bound tighter than '&'
        A[i] = (i + wave_size) & (local_mem_bw_test_size - 1);
    }

    // copy array to device
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, local_mem_bw_test_size * sizeof(uint32_t), NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, local_mem_bw_test_size * sizeof(uint32_t), A, 0, NULL, NULL);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint32_t) * thread_count, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue); // writes should be blocking, but are they?

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    time_diff_ms = end_timing();
    *time_ms = time_diff_ms;

    // each thread does iterations reads (4 bytes each)
    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count) / 1e9;
    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;
    //fprintf(stderr, "%llu ms, %llu GB\n", time_diff_ms, total_data_gb);

    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret);
    clFinish(command_queue);

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    free(A);
    free(result);
    return bandwidth;
}
#define local64_test_size 2048

// 64-bit element local memory bandwidth test.
// time_ms: out parameter, elapsed wall time in ms (set only on success).
// Returns bandwidth in GB/s, or 0 on failure.
float local_64_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int64_t* time_ms)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    float bandwidth, total_data_gb;
    cl_int ret;
    int64_t time_diff_ms;

    uint64_t* A = (uint64_t*)malloc(sizeof(uint64_t) * local64_test_size);
    uint64_t* result = (uint64_t*)malloc(sizeof(uint64_t) * thread_count);
    if (!A || !result)
    {
        // FIX: previously only warned, then dereferenced NULL; size in the
        // message now reflects 8-byte elements
        fprintf(stderr, "Failed to allocate memory for test size %d KB\n",
            (int)(local64_test_size * sizeof(uint64_t) / 1024));
        free(A);
        free(result);
        return 0.0f;
    }

    for (uint64_t i = 0; i < local64_test_size; i++)
    {
        A[i] = i;
    }

    // copy array to device
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, local64_test_size * sizeof(uint64_t), NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, local64_test_size * sizeof(uint64_t), A, 0, NULL, NULL);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint64_t) * thread_count, NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint64_t) * thread_count, result, 0, NULL, NULL);

    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue); // writes should be blocking, but are they?

    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        bandwidth = 0;
        goto cleanup;
    }
    time_diff_ms = end_timing();
    *time_ms = time_diff_ms;

    // each thread does iterations reads
    // NOTE(review): elements here are 8 bytes but this formula counts 4 bytes
    // per read (sizeof(float)) — confirm against the kernel before changing,
    // as fixing it would double all reported numbers.
    total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count + thread_count) / 1e9;
    bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;

    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint64_t) * thread_count, result, 0, NULL, NULL);
    if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret);
    clFinish(command_queue);

cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    free(A);
    free(result);
    return bandwidth;
}
// default test sizes for link bandwidth, in KB
const uint64_t default_link_test_sizes[] = { 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152 };

// Host <-> GPU link bandwidth test. For each test size, repeatedly copies a
// host buffer to the device and back (blocking transfers), printing per-size
// CSV results and finally the peak bandwidth seen in either direction.
// iterations: scaling factor for the per-size transfer count.
void link_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations)
{
    cl_int ret;
    float gpu_to_host_bandwidth, host_to_gpu_bandwidth, total_data_gb;
    uint32_t time_diff_ms, loop_iterations;
    uint32_t* A;
    // FIX: element size taken from the array itself instead of a hard-coded
    // 'unsigned long long' (same value, but can't silently go stale)
    int test_size_count = sizeof(default_link_test_sizes) / sizeof(default_link_test_sizes[0]);
    float* results = (float*)malloc(sizeof(float) * 2 * test_size_count);
    if (results == NULL)
    {
        fprintf(stderr, "Failed to allocate results array\n");
        return;
    }
    memset(results, 0, sizeof(float) * 2 * test_size_count);

    printf("Copy Size (KB), Host to GPU (GB/s), GPU to Host (GB/s)\n");
    for (int size_idx = 0; size_idx < test_size_count; size_idx++) {
        uint64_t testSizeBytes = default_link_test_sizes[size_idx] * 1024;
        uint64_t testSizeKb = default_link_test_sizes[size_idx];
        if (testSizeBytes > max_global_test_size) {
            // FIX: %d and %lu were wrong for 64-bit values (LLP64)
            printf("%llu K would exceed device's max buffer size of %llu K, stopping here.\n",
                (unsigned long long)testSizeKb, (unsigned long long)(max_global_test_size / 1024));
            break;
        }
        A = (uint32_t*)malloc(testSizeBytes);
        if (A == NULL) {
            // FIX: allocation was previously unchecked
            fprintf(stderr, "Failed to allocate %llu KB host buffer\n", (unsigned long long)testSizeKb);
            break;
        }
        memset(A, 0, testSizeBytes);
        cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, testSizeBytes, NULL, &ret);
        clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
        // scale iteration count down with size to keep total transferred data
        // roughly constant
        loop_iterations = ((uint64_t)iterations * 1000) / (uint64_t)testSizeBytes;
        if (loop_iterations == 0) loop_iterations = 1; // FIX: avoid a 0-transfer (NaN bandwidth) pass

        start_timing();
        for (uint32_t iter_idx = 0; iter_idx < loop_iterations; iter_idx++)
        {
            ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, testSizeBytes, A, 0, NULL, NULL);
            clFinish(command_queue);
        }
        time_diff_ms = end_timing();
        total_data_gb = ((float)loop_iterations * testSizeBytes) / 1e9;
        host_to_gpu_bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;
        results[size_idx * 2] = host_to_gpu_bandwidth;

        start_timing();
        for (uint32_t iter_idx = 0; iter_idx < loop_iterations; iter_idx++)
        {
            ret = clEnqueueReadBuffer(command_queue, a_mem_obj, CL_TRUE, 0, testSizeBytes, A, 0, NULL, NULL);
            clFinish(command_queue);
        }
        time_diff_ms = end_timing();
        total_data_gb = ((float)loop_iterations * testSizeBytes) / 1e9;
        gpu_to_host_bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms;
        results[size_idx * 2 + 1] = gpu_to_host_bandwidth;

        printf("%llu,%f,%f\n", (unsigned long long)testSizeKb, host_to_gpu_bandwidth, gpu_to_host_bandwidth);
        clReleaseMemObject(a_mem_obj);
        free(A);
    }

    // report the best bandwidth seen in either direction
    float max = 0;
    for (int size_idx = 0; size_idx < test_size_count; size_idx++) {
        if (results[size_idx * 2] > max) max = results[size_idx * 2];
        if (results[size_idx * 2 + 1] > max) max = results[size_idx * 2 + 1];
    }
    printf("Link bandwidth: %f GB/s\n", max);

    free(results);
    clFlush(command_queue);
    clFinish(command_queue);
}
================================================
FILE: GpuMemLatency/common.c
================================================
#include "opencltest.h"
// Device/platform chosen by get_context_from_user(); read by the query
// helpers below (getCuCount, checkExtensionSupport, ...).
cl_device_id selected_device_id;
cl_platform_id selected_platform_id;
// NOTE(review): appears to cap per-buffer test sizes (see link_bw_test);
// assignment is not visible in this file — confirm where it is set.
cl_ulong max_global_test_size;
// presumably a flag to dump the compiled program binary; set elsewhere — TODO confirm
int saveprogram = 0;
// Fills pattern_arr with a random pointer-chasing pattern using Sattolo's
// algorithm (a Fisher-Yates variant that always yields one single cycle).
// list_size: array length in 32-bit elements.
// byte_increment: spacing between used slots, in bytes; assumes it is a
//   multiple of sizeof(uint32_t). Only every 'increment'-th slot is used,
//   and each used slot holds the index of the next slot in the cycle.
void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) {
    uint32_t increment = byte_increment / sizeof(uint32_t);
    uint32_t element_count = list_size / increment;
    for (uint32_t i = 0; i < element_count; i++) {
        pattern_arr[i * increment] = i * increment;
    }
    // Sattolo's: for i from n-1 down to 1, swap a[i] with a[j], j uniform in
    // [0, i-1]. Strictly j < i guarantees the result is one full cycle.
    uint32_t iter = element_count;
    while (iter > 1) {
        iter -= 1;
        // FIX: was rand() % (iter - 1), which could never select j == iter-1
        // and biased the permutation; the correct range is [0, iter-1].
        uint32_t j = rand() % iter;
        uint32_t tmp = pattern_arr[iter * increment];
        pattern_arr[iter * increment] = pattern_arr[j * increment];
        pattern_arr[j * increment] = tmp;
    }
}
cl_uint getCuCount() {
cl_uint cuCount;
size_t cuCountLen = sizeof(cl_uint);
if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_COMPUTE_UNITS, cuCountLen, &cuCount, &cuCountLen))
{
fprintf(stderr, "Could not get number of compute units\n");
return 0;
}
return cuCount;
}
// Queries CL_DEVICE_MAX_WORK_GROUP_SIZE for the selected device.
// Returns the maximum workgroup size, or 0 if the query fails.
size_t getMaxWorkgroupSize()
{
    size_t maxWorkgroupSize;
    size_t workgroupSizeLen = sizeof(size_t);
    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, workgroupSizeLen, &maxWorkgroupSize, &workgroupSizeLen))
    {
        // FIX: message was copy-pasted from getCuCount()
        fprintf(stderr, "Could not get max workgroup size\n");
        return 0;
    }
    return maxWorkgroupSize;
}
cl_ulong get_max_constant_buffer_size() {
cl_ulong constant_buffer_size = 0;
if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &constant_buffer_size, NULL)) {
fprintf(stderr, "Failed to get max constant buffer size\n");
}
return constant_buffer_size;
}
// Queries CL_DEVICE_MAX_MEM_ALLOC_SIZE (largest single buffer allocation)
// for the selected device. Returns the size in bytes, or 0 on failure.
cl_ulong get_max_buffer_size() {
    cl_ulong buffer_size = 0;
    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &buffer_size, NULL)) {
        // FIX: message was copy-pasted from get_max_constant_buffer_size()
        fprintf(stderr, "Failed to get max buffer size\n");
    }
    return buffer_size;
}
cl_ulong get_max_tex_buffer_size() {
cl_ulong buffer_size = 0;
if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(cl_ulong), &buffer_size, NULL)) {
fprintf(stderr, "Failed to get max texture buffer size\n");
}
return buffer_size;
}
cl_ulong get_max_2d_tex_width() {
cl_ulong max_width = 0;
if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(cl_ulong), &max_width, NULL)) {
fprintf(stderr, "Failed to get max texture width\n");
}
return max_width;
}
cl_ulong get_max_2d_tex_height() {
cl_ulong max_width = 0;
if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(cl_ulong), &max_width, NULL)) {
fprintf(stderr, "Failed to get max texture height\n");
}
return max_width;
}
// Returns 1 if the selected device advertises extension_name in its
// CL_DEVICE_EXTENSIONS list, 0 otherwise (or on any query/allocation failure).
short checkExtensionSupport(const char *extension_name) {
    size_t extensionLen = 0;
    char* extensions;
    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_EXTENSIONS, 0, NULL, &extensionLen))
    {
        fprintf(stderr, "Could not determine memory needed to hold OpenCL extension list\n");
        return 0;
    }
    extensions = (char *)malloc(extensionLen + 1);
    if (extensions == NULL)
    {
        fprintf(stderr, "Could not allocate memory for OpenCL extension list\n");
        return 0;
    }
    extensions[extensionLen] = 0;
    if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_EXTENSIONS, extensionLen, extensions, &extensionLen))
    {
        fprintf(stderr, "Could not get OpenCL extensions list\n");
        free(extensions); // FIX: was leaked on this error path
        return 0;
    }
    //fprintf(stderr, "OpenCL extensions list: %s\n", extensions);

    // extension list is space separated; count separators, then split the
    // string in place and record the start offset of each token
    size_t spaceCount = 0;
    for (size_t i = 0; i < extensionLen; i++) {
        if (extensions[i] == ' ') spaceCount++;
    }
    int* extensionsSpaces = (int*)malloc(sizeof(int) * (spaceCount + 1));
    if (extensionsSpaces == NULL)
    {
        fprintf(stderr, "Could not allocate memory for extension list offsets\n");
        free(extensions);
        return 0;
    }
    extensionsSpaces[0] = 0;
    int spaceIdx = 1;
    for (size_t i = 0; i < extensionLen; i++) {
        if (extensions[i] == ' ') {
            extensions[i] = 0;
            extensionsSpaces[spaceIdx] = (int)(i + 1);
            spaceIdx++;
        }
    }

    short found = 0;
    // FIX: also check the final token after the last space; the original loop
    // stopped one segment short. Empty segments (e.g. from a trailing space)
    // are skipped so an empty name still never matches.
    for (size_t i = 0; i <= spaceCount; i++)
    {
        if (extensions[extensionsSpaces[i]] == 0) continue;
        if (strcmp(extension_name, extensions + extensionsSpaces[i]) == 0) {
            found = 1;
            break;
        }
    }
    free(extensionsSpaces);
    free(extensions);
    return found;
}
///
/// populate global variables for opencl device id and platform id
///
/// platform index. if -1, prompt user
/// device index. if -1. prompt user
/// opencl context
cl_context get_context_from_user(int platform_index, int device_index) {
int i = 0;
int selected_platform_index = 0, selected_device_index = 0;
// Get platform and device information
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(0, NULL, &ret_num_platforms);
cl_platform_id* platforms = NULL;
cl_device_id* devices = NULL;
cl_context context = NULL;
platforms = (cl_platform_id*)malloc(ret_num_platforms * sizeof(cl_platform_id));
ret = clGetPlatformIDs(ret_num_platforms, platforms, NULL);
fprintf(stderr, "clGetPlatformIDs returned %d. %d platforms\n", ret, ret_num_platforms);
for (i = 0; i < ret_num_platforms; i++)
{
size_t platform_name_len;
char* platform_name = NULL;
if (CL_SUCCESS != clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &platform_name_len)) {
fprintf(stderr, "Failed to get platform info for platform %d\n", i);
continue;
}
platform_name = (char*)malloc(platform_name_len + 1);
platform_name[platform_name_len] = 0;
if (CL_SUCCESS != clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, platform_name_len, platform_name, NULL)) {
fprintf(stderr, "Failed to get platform name for platform %d\n", i);
free(platform_name);
continue;
}
fprintf(stderr, "Platform %d: %s\n", i, platform_name);
free(platform_name);
}
selected_platform_index = platform_index;
if (selected_platform_index == -1)
{
printf("Enter platform #:");
scanf("%d", &selected_platform_index);
}
if (selected_platform_index > ret_num_platforms - 1)
{
fprintf(stderr, "platform index out of range\n");
goto get_context_from_user_end;
}
selected_platform_id = platforms[selected_platform_index];
if (CL_SUCCESS != clGetDeviceIDs(selected_platform_id, CL_DEVICE_TYPE_ALL, 0, NULL, &ret_num_devices)) {
fprintf(stderr, "Failed to enumerate device ids for platform");
return NULL;
}
devices = (cl_device_id*)malloc(ret_num_devices * sizeof(cl_device_id));
if (CL_SUCCESS != clGetDeviceIDs(selected_platform_id, CL_DEVICE_TYPE_ALL, ret_num_devices, devices, NULL)) {
fprintf(stderr, "Failed to get device ids for platform");
free(devices);
return NULL;
}
fprintf(stderr, "clGetDeviceIDs returned %d devices\n", ret_num_devices);
for (i = 0; i < ret_num_devices; i++)
{
size_t device_name_len;
char* device_name = NULL;
if (CL_SUCCESS != clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 0, NULL, &device_name_len)) {
fprintf(stderr, "Failed to get name length for device %d\n", i);
continue;
}
//fprintf(stderr, "debug: device name length: %d\n", device_name_len);
device_name = (char*)malloc(device_name_len + 1);
device_name[device_name_len] = 0;
if (CL_SUCCESS != clGetDeviceInfo(devices[i], CL_DEVICE_NAME, device_name_len, device_name, &device_name_len)) {
fprintf(stderr, "Failed to get name for device %d\n", i);
free(device_name);
continue;
}
fprintf(stderr, "Device %d: %s\n", i, device_name);
free(device_name);
}
selected_device_index = device_index;
if (selected_device_index == -1)
{
fprintf(stderr, "Enter device #:");
scanf("%d", &selected_device_index);
}
if (selected_device_index > ret_num_devices - 1)
{
fprintf(stderr, "Device index out of range\n");
goto get_context_from_user_end;
}
selected_device_id = devices[selected_device_index];
// Create an OpenCL context
context = clCreateContext(NULL, 1, &selected_device_id, NULL, NULL, &ret);
fprintf(stderr, "clCreateContext returned %d\n", ret);
fprintf(stderr, "Max workgroup size for device: %u\n", getMaxWorkgroupSize());
get_context_from_user_end:
free(platforms);
free(devices);
return context;
}
// Read an OpenCL kernel source file and build it for the selected device.
// fname: path to the .cl source file (up to MAX_SOURCE_SIZE bytes are read)
// params: build options passed to clBuildProgram, may be NULL
// Returns the (possibly failed) program object; exits the process if the
// source file cannot be read. Prints the build log on compile failure.
cl_program build_program(cl_context context, const char* fname, const char *params)
{
    cl_int ret;
    FILE* fp = NULL;
    char* source_str;
    size_t source_size;
    // binary mode so fread's byte count matches the file length on Windows too
    fp = fopen(fname, "rb");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel %s.\n", fname);
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    if (!source_str) {
        fprintf(stderr, "Failed to allocate memory for kernel source %s\n", fname);
        fclose(fp);
        exit(1);
    }
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);
    cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t*)&source_size, &ret);
    ret = clBuildProgram(program, 1, &selected_device_id, params, NULL, NULL);
    //fprintf(stderr, "clBuildProgram %s returned %d\n", fname, ret);
    if (ret == CL_BUILD_PROGRAM_FAILURE) // was the magic number -11
    {
        size_t log_size;
        fprintf(stderr, "OpenCL kernel build error\n");
        clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        char* log = (char*)malloc(log_size);
        clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
        fprintf(stderr, "%s\n", log);
        free(log);
    }
    free(source_str);
    return program;
}
// Dump the compiled binaries for every device a program was built for,
// writing each to a file named prog<device index>_<name>.
void write_program(cl_program program, const char *name)
{
    size_t* binarySizes = NULL;
    cl_uint nDevices = 0; // CL_PROGRAM_NUM_DEVICES returns cl_uint, not size_t
    cl_int ret;
    char fname[255];
    cl_uint i;
    unsigned char** binaries = NULL;
    ret = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &nDevices, NULL);
    if (ret != CL_SUCCESS) {
        fprintf(stderr, "Could not get number of devices for program\n");
        return;
    }
    fprintf(stderr, "Program is associated with %u devices\n", nDevices);
    binarySizes = (size_t*)malloc(sizeof(size_t) * nDevices);
    if (binarySizes == NULL)
    {
        fprintf(stderr, "Failed to allocate memory for binary sizes\n");
        goto getProgram_Fail;
    }
    ret = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * nDevices, binarySizes, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Could not get program binary sizes\n");
        goto getProgram_Fail;
    }
    // was malloc(nDevices): that allocated nDevices *bytes* rather than
    // nDevices pointers, overflowing the heap block below. calloc also
    // zeroes the pointers so the cleanup loop is safe on partial failure.
    binaries = (unsigned char**)calloc(nDevices, sizeof(unsigned char*));
    if (binaries == NULL)
    {
        fprintf(stderr, "Failed to allocate memory for program binaries\n");
        goto getProgram_Fail;
    }
    for (i = 0; i < nDevices; i++) {
        fprintf(stderr, "Device %u: %llu byte program\n", i, (unsigned long long)binarySizes[i]);
        binaries[i] = (unsigned char*)malloc(binarySizes[i]);
        if (binaries[i] == NULL)
        {
            fprintf(stderr, "Failed to allocate memory for binary %u\n", i);
            goto getProgram_Fail;
        }
    }
    ret = clGetProgramInfo(program, CL_PROGRAM_BINARIES, nDevices * sizeof(unsigned char*), binaries, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Could not get program binaries\n");
        goto getProgram_Fail;
    }
    for (i = 0; i < nDevices; i++)
    {
        snprintf(fname, sizeof(fname), "prog%u_%s", i, name);
        FILE* dst = fopen(fname, "wb"); // binary mode: compiled kernels are not text
        if (dst == NULL)
        {
            fprintf(stderr, "Could not open %s for writing\n", fname);
            continue;
        }
        fwrite(binaries[i], 1, binarySizes[i], dst);
        fclose(dst);
        fprintf(stderr, "Wrote compiled kernel to %s\n", fname);
    }
getProgram_Fail:
    if (binaries != NULL)
    {
        // guard: the early failure paths jump here before binaries is allocated;
        // the old code dereferenced the NULL pointer in this loop
        for (i = 0; i < nDevices; i++) free(binaries[i]);
        free(binaries);
    }
    free(binarySizes);
}
// Given last run settings, return target iteration count that should make the next run
// go for approximately TARGET_TIME_MS.
// iterations: iteration count used by the last run
// time_ms: how long that run took; 0 means "too fast to measure", so scale up 100x
uint32_t adjust_iterations(uint32_t iterations, uint64_t time_ms)
{
    // check for zero first - the old code performed the division before the
    // check, computing a garbage (infinite) intermediate value
    if (time_ms == 0) return iterations * 100;
    return (uint32_t)((float)iterations * TARGET_TIME_MS / (float)time_ms);
}
================================================
FILE: GpuMemLatency/instruction_rate.c
================================================
#include "opencltest.h"
float fp64_instruction_rate_test(cl_context context,
cl_command_queue command_queue,
uint32_t thread_count,
uint32_t local_size,
uint32_t chase_iterations,
int float4_element_count,
cl_mem a_mem_obj,
cl_mem result_obj,
cl_float* A,
cl_float* result);
float fp16_instruction_rate_test(cl_context context,
cl_command_queue command_queue,
uint32_t thread_count,
uint32_t local_size,
uint32_t chase_iterations,
int float4_element_count,
cl_mem a_mem_obj,
cl_mem result_obj,
cl_float* A,
cl_float* result);
float run_rate_test(cl_context context,
cl_command_queue command_queue,
cl_kernel kernel,
uint32_t thread_count,
uint32_t local_size,
uint32_t chase_iterations,
int float4_element_count,
cl_mem a_mem_obj,
cl_mem result_obj,
cl_float* A,
cl_float* result,
float totalOps);
float run_latency_test(cl_context context,
cl_command_queue command_queue,
cl_kernel kernel,
uint32_t chase_iterations,
int float4_element_count,
cl_mem a_mem_obj,
cl_mem result_obj,
cl_float* A,
cl_float* result,
float opsPerIteration);
float global_totalOps;
// Uber test: measures throughput and dependent-op latency for scalar and
// vector ALU operations (INT8/16/32/64, FP32, and optionally FP16/FP64).
// Individual results are printed to stderr/stdout as they complete.
// forcefp16 / forcefp64: run those tests even if the extension isn't reported.
// Returns 0 (results are reported via prints, not the return value).
float instruction_rate_test(cl_context context,
    cl_command_queue command_queue,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int forcefp16,
    int forcefp64)
{
    float gOpsPerSec = 0, opsPerIteration;
    cl_int ret;
    int float4_element_count = thread_count * 4;
    // Allocate host memory up front; the old code printed on failure but kept
    // going, which crashed on the first memset/loop below.
    float* A = (float*)malloc(sizeof(float) * float4_element_count * 4);
    float* result = (float*)malloc(sizeof(float) * 4 * thread_count);
    if (!A || !result)
    {
        fprintf(stderr, "Failed to allocate memory instruction rate test\n");
        free(A);
        free(result);
        return 0;
    }
    cl_program program = build_program(context, "instruction_rate_kernel.cl", NULL);
    if (saveprogram) write_program(program, "irate");
    cl_kernel int32_add_rate_kernel = clCreateKernel(program, "int32_add_rate_test", &ret);
    cl_kernel int32_mul_rate_kernel = clCreateKernel(program, "int32_mul_rate_test", &ret);
    cl_kernel fp32_add_rate_kernel = clCreateKernel(program, "fp32_add_rate_test", &ret);
    cl_kernel fp32_fma_rate_kernel = clCreateKernel(program, "fp32_fma_rate_test", &ret);
    cl_kernel fp32_builtin_fma_rate_kernel = clCreateKernel(program, "fp32_builtin_fma_rate_test", &ret);
    cl_kernel fp32_mad_rate_kernel = clCreateKernel(program, "fp32_mad_rate_test", &ret);
    cl_kernel fp32_rcp_rate_kernel = clCreateKernel(program, "fp32_rcp_rate_test", &ret);
    cl_kernel fp32_rsqrt_rate_kernel = clCreateKernel(program, "fp32_rsqrt_rate_test", &ret);
    cl_kernel mix_fp32_int32_add_rate_kernel = clCreateKernel(program, "mix_fp32_int32_add_rate_test", &ret);
    cl_kernel mix_fp32_int32_addmul_rate_kernel = clCreateKernel(program, "mix_fp32_int32_addmul_rate_test", &ret);
    cl_kernel int64_add_rate_kernel = clCreateKernel(program, "int64_add_rate_test", &ret);
    cl_kernel int64_mul_rate_kernel = clCreateKernel(program, "int64_mul_rate_test", &ret);
    cl_kernel int16_add_rate_kernel = clCreateKernel(program, "int16_add_rate_test", &ret);
    cl_kernel int16_mul_rate_kernel = clCreateKernel(program, "int16_mul_rate_test", &ret);
    cl_kernel int8_add_rate_kernel = clCreateKernel(program, "int8_add_rate_test", &ret);
    cl_kernel int8_mul_rate_kernel = clCreateKernel(program, "int8_mul_rate_test", &ret);
    cl_kernel fp32_fma_latency_kernel = clCreateKernel(program, "fp32_fma_latency_test", &ret);
    cl_kernel fp32_add_latency_kernel = clCreateKernel(program, "fp32_add_latency_test", &ret);
    cl_kernel int32_add_latency_kernel = clCreateKernel(program, "int32_add_latency_test", &ret);
    cl_kernel int32_mul_latency_kernel = clCreateKernel(program, "int32_mul_latency_test", &ret);
    cl_kernel int32_add_scalar_latency_kernel = clCreateKernel(program, "int32_add_scalar_latency_test", &ret);
    cl_kernel int32_mul_scalar_latency_kernel = clCreateKernel(program, "int32_mul_scalar_latency_test", &ret);
    cl_kernel fp32_add_scalar_latency_kernel = clCreateKernel(program, "fp32_add_scalar_latency_test", &ret);
    cl_kernel fp32_fma_scalar_latency_kernel = clCreateKernel(program, "fp32_fma_scalar_latency_test", &ret);
    cl_kernel fp32_mul_scalar_latency_kernel = clCreateKernel(program, "fp32_mul_scalar_latency_test", &ret);
    cl_kernel fp32_mul_latency_kernel = clCreateKernel(program, "fp32_mul_latency_test", &ret);
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, float4_element_count * sizeof(float), NULL, &ret);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * 4 * thread_count, NULL, &ret);
    // Integer test first
    uint32_t *int32_A = (uint32_t*)A;
    for (int i = 0; i < float4_element_count * 4; i++)
    {
        int32_A[i] = i + 1;
    }
    // 4x int4 * 8 per iteration, and count the loop increment too
    opsPerIteration = 4.0f * 8.0f;
    float int32_add_rate = run_rate_test(context, command_queue, int32_add_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT32 G Adds/sec: %f\n", int32_add_rate);
    printf("===== INT32 add latency =====\n");
    float int32_add_latency = run_latency_test(context, command_queue, int32_add_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "INT32 add latency: %f ns\n", int32_add_latency);
    printf("===== INT32 add latency (scalar) =====\n");
    int32_add_latency = run_latency_test(context, command_queue, int32_add_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "INT32 add latency (scalar): %f ns\n", int32_add_latency);
    printf("===== INT32 mul latency =====\n");
    float int32_mul_latency = run_latency_test(context, command_queue, int32_mul_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "INT32 mul latency: %f ns\n", int32_mul_latency);
    printf("===== INT32 mul latency (scalar) =====\n");
    int32_mul_latency = run_latency_test(context, command_queue, int32_mul_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "INT32 mul latency (scalar): %f ns\n", int32_mul_latency);
    opsPerIteration = 4.0f * 8.0f;
    // multiplies are slower on most hardware, so use fewer iterations
    float int32_mul_rate = run_rate_test(context, command_queue, int32_mul_rate_kernel, thread_count, local_size, (chase_iterations / 2),
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT32 G Multiplies/sec: %f\n", int32_mul_rate);
    // FP32 add and fma test
    cl_float* fp32_A = (cl_float*)A;
    for (int i = 0; i < float4_element_count * 4; i++)
    {
        fp32_A[i] = 0.5f * i;
    }
    opsPerIteration = 4.0f * 8.0f;
    float fp32_add_rate = run_rate_test(context, command_queue, fp32_add_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP32 G Adds/sec: %f\n", fp32_add_rate);
    printf("===== FP32 add latency =====\n");
    float fp32_add_latency = run_latency_test(context, command_queue, fp32_add_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "FP32 add latency: %f ns\n", fp32_add_latency);
    printf("===== FP32 add latency (scalar) =====\n");
    fp32_add_latency = run_latency_test(context, command_queue, fp32_add_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "FP32 add latency (scalar): %f ns\n", fp32_add_latency);
    printf("===== FP32 fma latency =====\n");
    float fp32_fma_latency = run_latency_test(context, command_queue, fp32_fma_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "FP32 FMA latency: %f ns\n", fp32_fma_latency);
    printf("===== FP32 fma latency (scalar) =====\n");
    fp32_fma_latency = run_latency_test(context, command_queue, fp32_fma_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "FP32 FMA latency (scalar): %f ns\n", fp32_fma_latency);
    printf("===== FP32 mul latency =====\n");
    fp32_fma_latency = run_latency_test(context, command_queue, fp32_mul_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "FP32 mul latency: %f ns\n", fp32_fma_latency);
    fp32_fma_latency = run_latency_test(context, command_queue, fp32_mul_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f);
    fprintf(stderr, "FP32 mul latency (scalar): %f ns\n", fp32_fma_latency);
    float fp32_fma_rate = run_rate_test(context, command_queue, fp32_fma_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP32 G FMA/sec: %f : %f GFLOPs\n", fp32_fma_rate, fp32_fma_rate * 2);
    float builtin_fp32_fma_rate = run_rate_test(context, command_queue, fp32_builtin_fma_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP32 G fma()/sec: %f : %f GFLOPs\n", builtin_fp32_fma_rate, builtin_fp32_fma_rate * 2);
    fp32_fma_rate = run_rate_test(context, command_queue, fp32_mad_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP32 G mad()/sec: %f : %f GFLOPs\n", fp32_fma_rate, fp32_fma_rate * 2);
    float fp32_rcp_rate = run_rate_test(context, command_queue, fp32_rcp_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP32 G native_recip/sec: %f\n", fp32_rcp_rate);
    float fp32_rsqrt_rate = run_rate_test(context, command_queue, fp32_rsqrt_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP32 G native_rsqrt/sec: %f\n", fp32_rsqrt_rate);
    // Mixed INT32 and FP32 - 4 FP32, 4 INT32, and the loop increment
    // takes FP inputs and converts some to int
    opsPerIteration = 4.0f * 8.0f + 1.0f;
    float mix_fp32_int32_rate = run_rate_test(context, command_queue, mix_fp32_int32_add_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "Mixed INT32 and FP32 G Adds/sec: %f\n", mix_fp32_int32_rate);
    // Test the same with integer multiplies
    mix_fp32_int32_rate = run_rate_test(context, command_queue, mix_fp32_int32_addmul_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "Mixed INT32 Multiplies and FP32 G Adds/sec: %f\n", mix_fp32_int32_rate);
    // INT64 add test - reinterpret input as 64-bit (half as many elements)
    cl_ulong* int64_A = (cl_ulong*)A;
    for (int i = 0; i < float4_element_count * 2; i++)
    {
        int64_A[i] = i * 2;
    }
    opsPerIteration = 2.0f * 8.0f;
    float int64_add_rate = run_rate_test(context, command_queue, int64_add_rate_kernel, thread_count, local_size, chase_iterations / 2,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT64 G Adds/sec: %f\n", int64_add_rate);
    opsPerIteration = 2.0f * 8.0f;
    float int64_mul_rate = run_rate_test(context, command_queue, int64_mul_rate_kernel, thread_count, local_size, chase_iterations / 8,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT64 G Multiplies/sec: %f\n", int64_mul_rate);
    // INT16 (short) tests
    cl_ushort* int16_A = (cl_ushort*)A;
    for (int i = 0; i < float4_element_count * 8; i++)
    {
        int16_A[i] = i;
    }
    // short8
    opsPerIteration = 8.0f * 8.0f;
    float int16_add_rate = run_rate_test(context, command_queue, int16_add_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT16 G Adds/sec: %f\n", int16_add_rate);
    float int16_mul_rate = run_rate_test(context, command_queue, int16_mul_rate_kernel, thread_count, local_size, chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT16 G Multiplies/sec: %f \n", int16_mul_rate);
    // INT8 (char) tests
    cl_char* int8_A = (cl_char*)A;
    for (int i = 0; i < float4_element_count * 8; i++)
    {
        int8_A[i] = i;
    }
    uint32_t int8_chase_iterations = chase_iterations / 10;
    opsPerIteration = 16.0f * 8.0f;
    float int8_add_rate = run_rate_test(context, command_queue, int8_add_rate_kernel, thread_count, local_size, int8_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT8 G Adds/sec: %f\n", int8_add_rate);
    float int8_mul_rate = run_rate_test(context, command_queue, int8_mul_rate_kernel, thread_count, local_size, int8_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "INT8 G Multiplies/sec: %f\n", int8_mul_rate);
    // checkExtensionSupport is defined in another translation unit
    short checkExtensionSupport(const char *extension_name);
    if (checkExtensionSupport("cl_khr_fp64") || forcefp64) {
        fp64_instruction_rate_test(context, command_queue, thread_count, local_size, chase_iterations, float4_element_count,
            a_mem_obj, result_obj, A, result);
    }
    else {
        fprintf(stderr, "FP64 not supported\n");
    }
    if (checkExtensionSupport("cl_khr_fp16") || forcefp16) {
        fp16_instruction_rate_test(context, command_queue, thread_count, local_size, chase_iterations, float4_element_count,
            a_mem_obj, result_obj, A, result);
    }
    else {
        fprintf(stderr, "FP16 not supported\n");
    }
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    // release kernels and the program - previously leaked on every call
    clReleaseKernel(int32_add_rate_kernel);
    clReleaseKernel(int32_mul_rate_kernel);
    clReleaseKernel(fp32_add_rate_kernel);
    clReleaseKernel(fp32_fma_rate_kernel);
    clReleaseKernel(fp32_builtin_fma_rate_kernel);
    clReleaseKernel(fp32_mad_rate_kernel);
    clReleaseKernel(fp32_rcp_rate_kernel);
    clReleaseKernel(fp32_rsqrt_rate_kernel);
    clReleaseKernel(mix_fp32_int32_add_rate_kernel);
    clReleaseKernel(mix_fp32_int32_addmul_rate_kernel);
    clReleaseKernel(int64_add_rate_kernel);
    clReleaseKernel(int64_mul_rate_kernel);
    clReleaseKernel(int16_add_rate_kernel);
    clReleaseKernel(int16_mul_rate_kernel);
    clReleaseKernel(int8_add_rate_kernel);
    clReleaseKernel(int8_mul_rate_kernel);
    clReleaseKernel(fp32_fma_latency_kernel);
    clReleaseKernel(fp32_add_latency_kernel);
    clReleaseKernel(int32_add_latency_kernel);
    clReleaseKernel(int32_mul_latency_kernel);
    clReleaseKernel(int32_add_scalar_latency_kernel);
    clReleaseKernel(int32_mul_scalar_latency_kernel);
    clReleaseKernel(fp32_add_scalar_latency_kernel);
    clReleaseKernel(fp32_fma_scalar_latency_kernel);
    clReleaseKernel(fp32_mul_scalar_latency_kernel);
    clReleaseKernel(fp32_mul_latency_kernel);
    clReleaseProgram(program);
    free(A);
    free(result);
    return gOpsPerSec;
}
// Executes one throughput kernel and converts the measured time into GOPS.
// The kernel is expected to perform opsPerIteration * chase_iterations
// operations per thread; buffers and host arrays are pre-allocated by the
// caller. Iteration count is scaled between runs until a run takes long
// enough (TARGET_TIME_MS / 2) to time reliably.
// Returns GOPS, or 0 if the kernel could not be enqueued or finished.
float run_rate_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int float4_element_count,
    cl_mem a_mem_obj,
    cl_mem result_obj,
    cl_float* A,
    cl_float* result,
    float opsPerIteration)
{
    size_t globalSize = thread_count;
    size_t localSize = local_size;
    cl_int status;
    float gops = 0.0f;
    uint64_t elapsedMs = 0;

    // stage input data and clear the result buffer
    memset(result, 0, sizeof(float) * 4 * thread_count);
    status = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, float4_element_count * sizeof(float), A, 0, NULL, NULL);
    status = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * 4 * thread_count, result, 0, NULL, NULL);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue);

    // repeat with scaled-up iteration counts until the run is long enough
    do {
        start_timing();
        status = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL);
        if (status != CL_SUCCESS)
        {
            fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", status);
            return 0;
        }
        status = clFinish(command_queue);
        if (status != CL_SUCCESS)
        {
            printf("Failed to finish command queue. clFinish returned %d\n", status);
            return 0;
        }
        elapsedMs = end_timing();

        float totalWork = (float)chase_iterations * opsPerIteration * (float)thread_count;
        gops = ((float)totalWork / 1e9) / ((float)elapsedMs / 1000);
        chase_iterations = adjust_iterations(chase_iterations, elapsedMs);
        clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    } while (elapsedMs < TARGET_TIME_MS / 2);
    return gops;
}
// Variation of the test above but input array size is aligned with assumed wave size.
// With partitionPattern == NULL, alternating waves get inputs that take different
// branch directions (divergence test). Otherwise, waves are made active or idle
// according to partitionPattern (partition test), and the pattern is printed.
// wave: assumed hardware wave/warp size in threads
// Returns GOPS achieved by the active threads, or 0 on failure.
float run_divergence_rate_test(cl_context context,
    cl_command_queue command_queue,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t wave,
    int *partitionPattern)
{
    size_t global_item_size = thread_count;
    size_t local_item_size = local_size;
    uint32_t active_threads = thread_count;
    cl_int ret;
    float totalOps, gOpsPerSec = 0.0f;
    uint64_t time_diff_ms = 0;
    uint32_t chase_iterations = 2500000;
    cl_mem a_mem_obj = NULL, result_obj = NULL;
    cl_program program = build_program(context, "instruction_rate_kernel.cl", NULL);
    cl_kernel kernel = clCreateKernel(program, partitionPattern == NULL ? "fp32_divergence_rate_test" : "fp32_partition_rate_test", &ret);
    float* result = (float*)malloc(sizeof(float) * thread_count);
    float* A = (float*)malloc(sizeof(float) * thread_count);
    if (!A || !result)
    {
        fprintf(stderr, "Failed to allocate memory for divergence rate test\n");
        goto divergence_cleanup; // gOpsPerSec is already 0
    }
    memset(result, 0, sizeof(float) * thread_count);
    if (partitionPattern != NULL) active_threads = 0;
    if (partitionPattern != NULL) fprintf(stderr, "\n");
    for (int i = 0; i < thread_count; i++)
    {
        if (partitionPattern == NULL) {
            // divergence test: alternate wave-sized groups between the two branch inputs
            if ((i / wave) % 2 == 0) A[i] = 0.2f;
            else A[i] = 0.8f;
        }
        else
        {
            if (partitionPattern[(i / wave)]) {
                A[i] = 0.2f;
                fprintf(stderr, "a ");
                active_threads++;
            }
            else
            {
                fprintf(stderr, "_ ");
                A[i] = 1.2f;
            }
            if ((i + 1) % wave == 0)
            {
                fprintf(stderr, "\n");
            }
        }
    }
    if (partitionPattern != NULL) fprintf(stderr, "\nActive threads: %d\n", active_threads);
    a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, thread_count * sizeof(float), NULL, &ret);
    result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, thread_count * sizeof(float), NULL, &ret);
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, thread_count * sizeof(float), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, thread_count * sizeof(float), result, 0, NULL, NULL);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    clFinish(command_queue);
    // start with a low iteration count and try to make it work for all GPUs without needing manual iteration adjustment
    while (time_diff_ms < TARGET_TIME_MS / 2) {
        start_timing();
        ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
        if (ret != CL_SUCCESS)
        {
            fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
            gOpsPerSec = 0;
            goto divergence_cleanup; // was return 0, leaking buffers, kernel, and program
        }
        ret = clFinish(command_queue);
        if (ret != CL_SUCCESS)
        {
            printf("Failed to finish command queue. clFinish returned %d\n", ret);
            gOpsPerSec = 0;
            goto divergence_cleanup; // was return 0, leaking buffers, kernel, and program
        }
        time_diff_ms = end_timing();
        // the kernel does 8 dependent ops per iteration; only active threads do work
        totalOps = (float)chase_iterations * 8 * (float)active_threads;
        gOpsPerSec = ((float)totalOps / 1e9) / ((float)time_diff_ms / 1000);
        chase_iterations = adjust_iterations(chase_iterations, time_diff_ms);
        clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    }
divergence_cleanup:
    if (a_mem_obj != NULL) clReleaseMemObject(a_mem_obj);
    if (result_obj != NULL) clReleaseMemObject(result_obj);
    free(A);
    free(result);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    return gOpsPerSec;
}
// often takes time for clocks to settle?
#define LATENCY_REPEAT 5
// Measures dependent-operation latency by running a kernel with a single
// work-item (global and local size 1), so every op must wait for the previous
// one's result. opsPerIteration is the number of dependent ops the kernel's
// loop body performs. First calibrates the iteration count so a run takes
// roughly TARGET_TIME_MS, then times LATENCY_REPEAT runs and keeps the best.
// Returns the minimum observed latency in nanoseconds per op, or 0 on failure.
float run_latency_test(cl_context context,
cl_command_queue command_queue,
cl_kernel kernel,
uint32_t chase_iterations,
int float4_element_count,
cl_mem a_mem_obj,
cl_mem result_obj,
cl_float* A,
cl_float* result,
float opsPerIteration)
{
// single work-item: no parallelism, so elapsed time reflects the dependency chain
size_t global_item_size = 1;
size_t local_item_size = 1;
cl_int ret;
float latency;
uint64_t time_diff_ms = 0;
// hack around latency taking longer
// (one thread takes far longer per op than a full throughput dispatch)
chase_iterations = chase_iterations / 50;
// testing returning a float4
memset(result, 0, sizeof(float) * 4);
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, float4_element_count * sizeof(float), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * 4, result, 0, NULL, NULL);
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
clFinish(command_queue);
//fprintf(stderr, "Submitting fp32 add kernel to command queue\n");
// start with a low iteration count and try to make it work for all GPUs without needing manual iteration adjustment
while (time_diff_ms < TARGET_TIME_MS / 2) {
start_timing();
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
latency = 0;
return 0;
}
ret = clFinish(command_queue);
if (ret != CL_SUCCESS)
{
printf("Failed to finish command queue. clFinish returned %d\n", ret);
latency = 0;
return 0;
}
time_diff_ms = end_timing();
chase_iterations = adjust_iterations(chase_iterations, time_diff_ms);
clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
}
// calibration done: compute latency for the last run, in ns per dependent op
float totalOps = (float)chase_iterations * opsPerIteration * (float)global_item_size;
latency = (float)time_diff_ms * 1e6 / totalOps;
// fprintf(stderr, "\tinitial run: %f ns latency\n", latency);
// re-run several times and keep the fastest, since clocks may still be ramping
float minLatency = 0.0f;
for (int i = 0; i < LATENCY_REPEAT; i++)
{
start_timing();
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
clFinish(command_queue);
time_diff_ms = end_timing();
latency = (float)time_diff_ms * 1e6 / totalOps;
// fprintf(stderr, "\trun %d: %f ns latency\n", i, latency);
// i == 0 seeds minLatency, so the 0.0f initializer is never the result
if (i == 0 || latency < minLatency) minLatency = latency;
}
//fprintf(stderr, "chase iterations: %d, thread count: %d\n", chase_iterations, thread_count);
//fprintf(stderr, "total ops: %f (%.2f G)\ntotal time: %llu ms\n", totalOps, totalOps / 1e9, time_diff_ms);
return minLatency;
}
// taking out FP64 because some implementations don't support it. putting another build program + create kernel section
// in the main instruction rate test function would be too messy
// Measures FP64 add / FMA / mad() throughput using a separate kernel file.
// Reuses the caller's buffers and host arrays; the input is reinterpreted as
// doubles. Returns the last measured rate (G mad()/sec).
float fp64_instruction_rate_test(cl_context context,
    cl_command_queue command_queue,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int float4_element_count,
    cl_mem a_mem_obj,
    cl_mem result_obj,
    cl_float *A,
    cl_float *result)
{
    float gOpsPerSec, opsPerIteration;
    cl_int ret;
    // FP64 add test
    // FP64 is slow on most consumer GPUs, so start with fewer iterations
    uint32_t low_chase_iterations = chase_iterations / 4;
    // reinterpret the input array as doubles (half as many elements fit)
    cl_double* fp64_A = (cl_double*)A;
    for (int i = 0; i < float4_element_count * 2; i++)
    {
        fp64_A[i] = 0.5f * i;
    }
    memset(result, 0, sizeof(float) * 4 * thread_count);
    cl_program program = build_program(context, "instruction_rate_fp64_kernel.cl", NULL);
    if (saveprogram) write_program(program, "fp64irate");
    cl_kernel fp64_add_rate_kernel = clCreateKernel(program, "fp64_add_rate_test", &ret);
    cl_kernel fp64_fma_rate_kernel = clCreateKernel(program, "fp64_fma_rate_test", &ret);
    cl_kernel fp64_mad_rate_kernel = clCreateKernel(program, "fp64_mad_rate_test", &ret);
    // double2 x 8 per loop iteration
    opsPerIteration = 2.0f * 8.0f;
    gOpsPerSec = run_rate_test(context, command_queue, fp64_add_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP64 G Adds/sec: %f\n", gOpsPerSec);
    gOpsPerSec = run_rate_test(context, command_queue, fp64_fma_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP64 G FMAs/sec: %f : %f FP64 GFLOPs\n", gOpsPerSec, gOpsPerSec * 2);
    gOpsPerSec = run_rate_test(context, command_queue, fp64_mad_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP64 G mad()/sec: %f : %f FP64 GFLOPs\n", gOpsPerSec, gOpsPerSec * 2);
    // release kernels and program - previously leaked
    clReleaseKernel(fp64_add_rate_kernel);
    clReleaseKernel(fp64_fma_rate_kernel);
    clReleaseKernel(fp64_mad_rate_kernel);
    clReleaseProgram(program);
    return gOpsPerSec;
}
// taking out FP16 too because it requires an extension to be supported
float fp16_instruction_rate_test(cl_context context,
    cl_command_queue command_queue,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int float4_element_count,
    cl_float* A,
    cl_float* result)
{
    // Runs the FP16 add/FMA throughput kernels and prints GOPS figures.
    // Kept separate from the main instruction rate test because FP16 needs
    // the cl_khr_fp16 extension. Returns the last measured rate (FMA GOPS).
    float gOpsPerSec, totalOps;
    cl_int ret;

    // Match the reduced iteration count used by the other extension tests
    uint32_t low_chase_iterations = chase_iterations / 4;

    // Reinterpret the float buffer as halves. BUGFIX: this previously cast to
    // (cl_float*), which compiles with a warning but is the wrong type; a half
    // is half the size of a float, so float4_element_count float4s hold
    // float4_element_count * 8 halves.
    cl_half* fp16_A = (cl_half*)A;
    for (int i = 0; i < float4_element_count * 8; i++)
    {
        fp16_A[i] = (cl_half)(0.5f * i);
    }

    memset(result, 0, sizeof(float) * 4 * thread_count);
    cl_program program = build_program(context, "instruction_rate_fp16_kernel.cl", NULL);
    if (saveprogram) write_program(program, "fp16irate");
    cl_kernel fp16_add_rate_kernel = clCreateKernel(program, "fp16_add_rate_test", &ret);
    cl_kernel fp16_fma_rate_kernel = clCreateKernel(program, "fp16_fma_rate_test", &ret);
    //cl_kernel fp16_rsqrt_rate_kernel = clCreateKernel(program, "fp16_rsqrt_rate_test", &ret);

    // 8 independent half8 chains per loop iteration = 8 lanes * 8 ops
    totalOps = 8.0f * 8.0f;
    gOpsPerSec = run_rate_test(context, command_queue, fp16_add_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, totalOps);
    fprintf(stderr, "FP16 G Adds/sec: %f\n", gOpsPerSec);
    gOpsPerSec = run_rate_test(context, command_queue, fp16_fma_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, totalOps);
    fprintf(stderr, "FP16 G FMAs/sec: %f : %f FP16 GFLOPs\n", gOpsPerSec, gOpsPerSec * 2);
    /*gOpsPerSec = run_rate_test(context, command_queue, fp16_rsqrt_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, totalOps);
    fprintf(stderr, "FP16 G native_rsqrt/sec: %f\n", gOpsPerSec);*/

    // Release per-test OpenCL objects so repeated calls don't leak
    clReleaseKernel(fp16_add_rate_kernel);
    clReleaseKernel(fp16_fma_rate_kernel);
    clReleaseProgram(program);
    return gOpsPerSec;
}
================================================
FILE: GpuMemLatency/instruction_rate_fp16_kernel.cl
================================================
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define rate_local_mem_test_size 256
// FP16 add throughput: 8 independent half8 accumulator chains (8 adds x 8 lanes
// = 64 FP16 adds per loop iteration) provide enough ILP to saturate the FP16
// units. Requires cl_khr_fp16 (enabled at the top of this file).
__kernel void fp16_add_rate_test(__global half8 *A, int count, __global half8 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// reads go straight to global memory; the "local_a" name is historical
__global half8 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed the chains from memory so the compiler cannot constant-fold the loop
half8 v0 = local_a[masked_tid];
half8 v1 = local_a[masked_tid + 1];
half8 v2 = local_a[masked_tid + 2];
half8 v3 = local_a[masked_tid + 3];
half8 v4 = v0 + v1;
half8 v5 = v0 + v2;
half8 v6 = v0 + v3;
half8 v7 = v1 + v2;
half8 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP16 FMA throughput: same structure as fp16_add_rate_test, but each chain does
// v += acc * v, which the compiler can contract into a fused multiply-add
// (host side counts it as one op and doubles it for FLOPS).
__kernel void fp16_fma_rate_test(__global half8 *A, int count, __global half8 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global half8 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
half8 v0 = local_a[masked_tid];
half8 v1 = local_a[masked_tid + 1];
half8 v2 = local_a[masked_tid + 2];
half8 v3 = local_a[masked_tid + 3];
half8 v4 = v0 + v1;
half8 v5 = v0 + v2;
half8 v6 = v0 + v3;
half8 v7 = v1 + v2;
half8 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 += acc * v0;
v1 += acc * v1;
v2 += acc * v2;
v3 += acc * v3;
v4 += acc * v4;
v5 += acc * v5;
v6 += acc * v6;
v7 += acc * v7;
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
/*__kernel void fp16_rsqrt_rate_test(__global half8 *A, int count, __global half8 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global half8 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
half8 v0 = local_a[masked_tid];
half8 v1 = local_a[masked_tid + 1];
half8 v2 = local_a[masked_tid + 2];
half8 v3 = local_a[masked_tid + 3];
half8 v4 = v0 + v1;
half8 v5 = v0 + v2;
half8 v6 = v0 + v3;
half8 v7 = v1 + v2;
for (int i = 0; i < count; i++) {
v0 = native_rsqrt(v0);
v1 = native_rsqrt(v1);
v2 = native_rsqrt(v2);
v3 = native_rsqrt(v3);
v4 = native_rsqrt(v4);
v5 = native_rsqrt(v5);
v6 = native_rsqrt(v6);
v7 = native_rsqrt(v7);
}
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
*/
================================================
FILE: GpuMemLatency/instruction_rate_fp64_kernel.cl
================================================
#define rate_local_mem_test_size 256
// FP64 add throughput: 8 independent double2 chains (8 adds x 2 lanes = 16 FP64
// adds per loop iteration) provide ILP without needing many FP64 registers.
__kernel void fp64_add_rate_test(__global double2 *A, int count, __global double2 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// reads go straight to global memory; the "local_a" name is historical
__global double2 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed the chains from memory so the compiler cannot constant-fold the loop
double2 v0 = local_a[masked_tid];
double2 v1 = local_a[masked_tid + 1];
double2 v2 = local_a[masked_tid + 2];
double2 v3 = local_a[masked_tid + 3];
double2 v4 = v0 + v1;
double2 v5 = v0 + v2;
double2 v6 = v0 + v3;
double2 v7 = v1 + v2;
double2 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP64 FMA throughput: same structure as fp64_add_rate_test, but each chain does
// v += acc * v, which the compiler may contract into a fused multiply-add.
__kernel void fp64_fma_rate_test(__global double2 *A, int count, __global double2 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global double2 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
double2 v0 = local_a[masked_tid];
double2 v1 = local_a[masked_tid + 1];
double2 v2 = local_a[masked_tid + 2];
double2 v3 = local_a[masked_tid + 3];
double2 v4 = v0 + v1;
double2 v5 = v0 + v2;
double2 v6 = v0 + v3;
double2 v7 = v1 + v2;
double2 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 += acc * v0;
v1 += acc * v1;
v2 += acc * v2;
v3 += acc * v3;
v4 += acc * v4;
v5 += acc * v5;
v6 += acc * v6;
v7 += acc * v7;
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP64 mad() throughput: same structure as fp64_fma_rate_test but uses the
// OpenCL mad() builtin, which permits a faster, possibly less accurate
// multiply-add than fma()/contraction.
__kernel void fp64_mad_rate_test(__global double2 *A, int count, __global double2 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global double2 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
double2 v0 = local_a[masked_tid];
double2 v1 = local_a[masked_tid + 1];
double2 v2 = local_a[masked_tid + 2];
double2 v3 = local_a[masked_tid + 3];
double2 v4 = v0 + v1;
double2 v5 = v0 + v2;
double2 v6 = v0 + v3;
double2 v7 = v1 + v2;
double2 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 = mad(acc, v0, v0);
v1 = mad(acc, v1, v1);
v2 = mad(acc, v2, v2);
v3 = mad(acc, v3, v3);
// BUGFIX: was mad(acc, v4, v3), a copy/paste typo that made the v4 chain
// depend on v3, serializing them. All chains now follow the mad(acc, vN, vN)
// pattern like fp32_mad_rate_test.
v4 = mad(acc, v4, v4);
v5 = mad(acc, v5, v5);
v6 = mad(acc, v6, v6);
v7 = mad(acc, v7, v7);
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
================================================
FILE: GpuMemLatency/instruction_rate_kernel.cl
================================================
#define rate_local_mem_test_size 512
// A must be at least (local size * 4) uint32 elements in size, but must not exceed local mem size
// jk it doesn't use local mem now
// INT32 add throughput: 8 independent uint4 chains (32 adds per loop iteration).
// Unlike most kernels in this file, this one stages A into __local memory and
// reloads acc from it every iteration.
__kernel void int32_add_rate_test(__global uint4 *A, int count, __global uint4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// cooperatively copy the first rate_local_mem_test_size elements into local mem
__local uint4 local_a[rate_local_mem_test_size];
for (int i = tid;i < rate_local_mem_test_size; i += max_offset)
local_a[i] = A[i];
barrier(CLK_LOCAL_MEM_FENCE);
// __global uint4 *local_a = A;
// clamp rather than mask so masked_tid + 7 stays in bounds
int masked_tid = min(tid, rate_local_mem_test_size - 8);
uint4 v0 = local_a[masked_tid];
uint4 v1 = local_a[masked_tid + 1];
uint4 v2 = local_a[masked_tid + 2];
uint4 v3 = local_a[masked_tid + 3];
uint4 v4 = local_a[masked_tid + 4];
uint4 v5 = local_a[masked_tid + 5];
uint4 v6 = local_a[masked_tid + 6];
uint4 v7 = local_a[masked_tid + 7];
for (int i = 0; i < count; i++) {
// fresh acc each iteration so the adds can't be folded into one multiply
uint4 acc = local_a[i & (rate_local_mem_test_size - 1)];
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT32 multiply throughput: 8 independent uint4 chains (32 multiplies per loop
// iteration) with a loop-invariant multiplier read from global memory.
__kernel void int32_mul_rate_test(__global uint4 *A, int count, __global uint4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// reads go straight to global memory; the "local_a" name is historical
__global uint4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed the chains from memory so the compiler cannot constant-fold the loop
uint4 v0 = local_a[masked_tid];
uint4 v1 = local_a[masked_tid + 1];
uint4 v2 = local_a[masked_tid + 2];
uint4 v3 = local_a[masked_tid + 3];
uint4 v4 = v0 + v1;
uint4 v5 = v0 + v2;
uint4 v6 = v0 + v3;
uint4 v7 = v1 + v2;
uint4 acc = local_a[0];
for (int i = 0; i < count; i++) {
//uint4 acc = local_a[i & (rate_local_mem_test_size) - 1];
v0 *= acc;
v1 *= acc;
v2 *= acc;
v3 *= acc;
v4 *= acc;
v5 *= acc;
v6 *= acc;
v7 *= acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 add throughput: 8 independent float4 chains (32 FP32 adds per loop
// iteration) provide enough ILP to saturate the FP32 ALUs.
__kernel void fp32_add_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// reads go straight to global memory; the "local_a" name is historical
__global float4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed the chains from memory so the compiler cannot constant-fold the loop
float4 v0 = local_a[masked_tid];
float4 v1 = local_a[masked_tid + 1];
float4 v2 = local_a[masked_tid + 2];
float4 v3 = local_a[masked_tid + 3];
float4 v4 = v0 + v1;
float4 v5 = v0 + v2;
float4 v6 = v0 + v3;
float4 v7 = v1 + v2;
float4 acc = local_a[0];
for (int i = 0; i < count; i++) {
//float4 acc = local_a[i & (rate_local_mem_test_size) - 1];
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 reciprocal throughput: 8 independent float4 chains of native_recip
// (32 lanes per loop iteration). native_recip maps to the hardware's fast
// approximate reciprocal, so this gauges the special function unit rate.
__kernel void fp32_rcp_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
float4 v0 = local_a[masked_tid];
float4 v1 = local_a[masked_tid + 1];
float4 v2 = local_a[masked_tid + 2];
float4 v3 = local_a[masked_tid + 3];
float4 v4 = v0 + v1;
float4 v5 = v0 + v2;
float4 v6 = v0 + v3;
float4 v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
float4 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 = native_recip(v0);
v1 = native_recip(v1);
v2 = native_recip(v2);
v3 = native_recip(v3);
v4 = native_recip(v4);
v5 = native_recip(v5);
v6 = native_recip(v6);
v7 = native_recip(v7);
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 reciprocal square root throughput: 8 independent float4 chains of
// native_rsqrt (32 lanes per loop iteration); gauges the special function unit.
__kernel void fp32_rsqrt_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
float4 v0 = local_a[masked_tid];
float4 v1 = local_a[masked_tid + 1];
float4 v2 = local_a[masked_tid + 2];
float4 v3 = local_a[masked_tid + 3];
float4 v4 = v0 + v1;
float4 v5 = v0 + v2;
float4 v6 = v0 + v3;
float4 v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
float4 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 = native_rsqrt(v0);
v1 = native_rsqrt(v1);
v2 = native_rsqrt(v2);
v3 = native_rsqrt(v3);
v4 = native_rsqrt(v4);
v5 = native_rsqrt(v5);
v6 = native_rsqrt(v6);
v7 = native_rsqrt(v7);
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT64 add throughput: 8 independent ulong2 chains (8 adds x 2 lanes = 16
// 64-bit adds per loop iteration).
__kernel void int64_add_rate_test(__global ulong2 *A, int count, __global ulong2 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// reads go straight to global memory; the "local_a" name is historical
__global ulong2 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed the chains from memory so the compiler cannot constant-fold the loop
ulong2 v0 = local_a[masked_tid];
ulong2 v1 = local_a[masked_tid + 1];
ulong2 v2 = local_a[masked_tid + 2];
ulong2 v3 = local_a[masked_tid + 3];
ulong2 v4 = v0 + v1;
ulong2 v5 = v0 + v2;
ulong2 v6 = v0 + v3;
ulong2 v7 = v1 + v2;
ulong2 acc = local_a[0];
for (int i = 0; i < count; i++) {
//uint4 acc = local_a[i & (rate_local_mem_test_size) - 1];
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT64 multiply throughput: 8 independent ulong2 chains (16 64-bit multiplies
// per loop iteration) with a loop-invariant multiplier.
__kernel void int64_mul_rate_test(__global ulong2 *A, int count, __global ulong2 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global ulong2 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
ulong2 v0 = local_a[masked_tid];
ulong2 v1 = local_a[masked_tid + 1];
ulong2 v2 = local_a[masked_tid + 2];
ulong2 v3 = local_a[masked_tid + 3];
ulong2 v4 = v0 + v1;
ulong2 v5 = v0 + v2;
ulong2 v6 = v0 + v3;
ulong2 v7 = v1 + v2;
ulong2 acc = local_a[0];
for (int i = 0; i < count; i++) {
//uint4 acc = local_a[i & (rate_local_mem_test_size) - 1];
v0 *= acc;
v1 *= acc;
v2 *= acc;
v3 *= acc;
v4 *= acc;
v5 *= acc;
v6 *= acc;
v7 *= acc;
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// Mixed FP32/INT32 add throughput: 4 float4 chains plus 4 int4 chains per loop
// iteration, to see whether FP and integer adds can issue in parallel. The int
// accumulator is reloaded from __local memory each iteration.
__kernel void mix_fp32_int32_add_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// stage an int4 copy of A in local memory for the integer chains
__local int4 local_a[rate_local_mem_test_size];
for (int i = tid;i < rate_local_mem_test_size; i += max_offset)
local_a[i] = convert_int4_sat(A[i]);
barrier(CLK_LOCAL_MEM_FENCE);
int masked_tid = tid & (rate_local_mem_test_size - 1);
// FP chains seed directly from global memory
float4 v0 = A[masked_tid];
float4 v1 = A[masked_tid + 1];
float4 v2 = A[masked_tid + 2];
float4 v3 = A[masked_tid + 3];
// integer chains seed from saturating conversions of the FP values
int4 v4 = convert_int4_sat(v0 + v1);
int4 v5 = convert_int4_sat(v0 + v2);
int4 v6 = convert_int4_sat(v0 + v3);
int4 v7 = convert_int4_sat(v1 + v2);
float4 fp_acc = A[0];
for (int i = 0; i < count; i++) {
int4 int_acc = local_a[i & (rate_local_mem_test_size - 1)];
v0 += fp_acc;
v1 += fp_acc;
v2 += fp_acc;
v3 += fp_acc;
v4 += int_acc;
v5 += int_acc;
v6 += int_acc;
v7 += int_acc;
}
// fold both result sets into the float4 output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + convert_float4(v4 + v5 + v6 + v7);
}
// Mixed FP32 add / INT32 multiply throughput: 4 float4 add chains plus 4 int4
// multiply chains per loop iteration, to see whether the two op types can
// issue in parallel.
__kernel void mix_fp32_int32_addmul_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float4 *fp32_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// FP chains seed directly from global memory
float4 v0 = fp32_a[masked_tid];
float4 v1 = fp32_a[masked_tid + 1];
float4 v2 = fp32_a[masked_tid + 2];
float4 v3 = fp32_a[masked_tid + 3];
// integer chains seed from saturating conversions of the FP values
int4 v4 = convert_int4_sat(v0 + v1);
int4 v5 = convert_int4_sat(v0 + v2);
int4 v6 = convert_int4_sat(v0 + v3);
int4 v7 = convert_int4_sat(v1 + v2);
float4 fp_acc = fp32_a[0];
int4 int_acc = convert_int4_sat(fp32_a[0]);
for (int i = 0; i < count; i++) {
v0 += fp_acc;
v1 += fp_acc;
v2 += fp_acc;
v3 += fp_acc;
v4 *= int_acc;
v5 *= int_acc;
v6 *= int_acc;
v7 *= int_acc;
}
// fold both result sets into the float4 output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + convert_float4(v4 + v5 + v6 + v7);
}
// FP32 FMA throughput: 8 independent float4 chains of v += acc * v (32 lanes
// per loop iteration), which the compiler can contract into fused multiply-adds.
__kernel void fp32_fma_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// all 8 chains seed directly from memory (unlike most kernels here, which
// derive v4..v7 arithmetically)
float4 v0 = local_a[masked_tid];
float4 v1 = local_a[masked_tid + 1];
float4 v2 = local_a[masked_tid + 2];
float4 v3 = local_a[masked_tid + 3];
float4 v4 = local_a[masked_tid + 4];
float4 v5 = local_a[masked_tid + 5];
float4 v6 = local_a[masked_tid + 6];
float4 v7 = local_a[masked_tid + 7];
float4 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 += acc * v0;
v1 += acc * v1;
v2 += acc * v2;
v3 += acc * v3;
v4 += acc * v4;
v5 += acc * v5;
v6 += acc * v6;
v7 += acc * v7;
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 fma() builtin throughput: like fp32_fma_rate_test but uses the
// correctly-rounded fma() builtin instead of relying on contraction of a*b+c.
__kernel void fp32_builtin_fma_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// all 8 chains seed directly from memory
float4 v0 = local_a[masked_tid];
float4 v1 = local_a[masked_tid + 1];
float4 v2 = local_a[masked_tid + 2];
float4 v3 = local_a[masked_tid + 3];
float4 v4 = local_a[masked_tid + 4];
float4 v5 = local_a[masked_tid + 5];
float4 v6 = local_a[masked_tid + 6];
float4 v7 = local_a[masked_tid + 7];
float4 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 = fma(acc, v0, v0);
v1 = fma(acc, v1, v1);
v2 = fma(acc, v2, v2);
v3 = fma(acc, v3, v3);
v4 = fma(acc, v4, v4);
v5 = fma(acc, v5, v5);
v6 = fma(acc, v6, v6);
v7 = fma(acc, v7, v7);
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 mad() throughput: like fp32_builtin_fma_rate_test but uses mad(), which
// permits a faster, possibly less accurate multiply-add than fma().
__kernel void fp32_mad_rate_test(__global float4 *A, int count, __global float4 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float4 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
float4 v0 = local_a[masked_tid];
float4 v1 = local_a[masked_tid + 1];
float4 v2 = local_a[masked_tid + 2];
float4 v3 = local_a[masked_tid + 3];
float4 v4 = v0 + v1;
float4 v5 = v0 + v2;
float4 v6 = v0 + v3;
float4 v7 = v1 + v2;
float4 acc = local_a[0];
for (int i = 0; i < count; i++) {
//float4 acc = local_a[i & (rate_local_mem_test_size) - 1];
v0 = mad(acc, v0, v0);
v1 = mad(acc, v1, v1);
v2 = mad(acc, v2, v2);
v3 = mad(acc, v3, v3);
v4 = mad(acc, v4, v4);
v5 = mad(acc, v5, v5);
v6 = mad(acc, v6, v6);
v7 = mad(acc, v7, v7);
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT16 add throughput: 8 independent short8 chains (8 adds x 8 lanes = 64
// 16-bit adds per loop iteration). A is staged into __local memory and acc is
// reloaded from it every iteration.
__kernel void int16_add_rate_test(__global short8 *A, int count, __global short8 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
//__global short8 *local_a = A;
// cooperatively copy A into local memory
__local short8 local_a[rate_local_mem_test_size];
for (int i = tid;i < rate_local_mem_test_size; i += max_offset)
local_a[i] = A[i];
barrier(CLK_LOCAL_MEM_FENCE);
// clamp rather than mask so masked_tid + 7 stays in bounds
int masked_tid = min(tid, rate_local_mem_test_size - 8);
short8 v0 = local_a[masked_tid];
short8 v1 = local_a[masked_tid + 1];
short8 v2 = local_a[masked_tid + 2];
short8 v3 = local_a[masked_tid + 3];
short8 v4 = local_a[masked_tid + 4];
short8 v5 = local_a[masked_tid + 5];
short8 v6 = local_a[masked_tid + 6];
short8 v7 = local_a[masked_tid + 7];
for (int i = 0; i < count; i++) {
// fresh acc each iteration so the adds can't be folded into one multiply
short8 acc = local_a[i & (rate_local_mem_test_size - 1)];
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT16 multiply throughput: same staging/structure as int16_add_rate_test, but
// each chain multiplies by a per-iteration value from local memory.
__kernel void int16_mul_rate_test(__global short8 *A, int count, __global short8 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
//__global short8 *local_a = A;
// cooperatively copy A into local memory
__local short8 local_a[rate_local_mem_test_size];
for (int i = tid;i < rate_local_mem_test_size; i += max_offset)
local_a[i] = A[i];
barrier(CLK_LOCAL_MEM_FENCE);
// clamp rather than mask so masked_tid + 7 stays in bounds
int masked_tid = min(tid, rate_local_mem_test_size - 8);
short8 v0 = local_a[masked_tid];
short8 v1 = local_a[masked_tid + 1];
short8 v2 = local_a[masked_tid + 2];
short8 v3 = local_a[masked_tid + 3];
short8 v4 = local_a[masked_tid + 4];
short8 v5 = local_a[masked_tid + 5];
short8 v6 = local_a[masked_tid + 6];
short8 v7 = local_a[masked_tid + 7];
for (int i = 0; i < count; i++) {
short8 acc = local_a[i & (rate_local_mem_test_size - 1)];
v0 *= acc;
v1 *= acc;
v2 *= acc;
v3 *= acc;
v4 *= acc;
v5 *= acc;
v6 *= acc;
v7 *= acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT8 add throughput: 8 independent char16 chains (8 adds x 16 lanes = 128
// 8-bit adds per loop iteration).
__kernel void int8_add_rate_test(__global char16 *A, int count, __global char16 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
// reads go straight to global memory; the "local_a" name is historical
__global char16 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed the chains from memory so the compiler cannot constant-fold the loop
char16 v0 = local_a[masked_tid];
char16 v1 = local_a[masked_tid + 1];
char16 v2 = local_a[masked_tid + 2];
char16 v3 = local_a[masked_tid + 3];
char16 v4 = v0 + v1;
char16 v5 = v0 + v2;
char16 v6 = v0 + v3;
char16 v7 = v1 + v2;
char16 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
// sum all chains into the output so no chain is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT8 multiply throughput: 8 independent char16 chains (128 8-bit multiplies
// per loop iteration) with a loop-invariant multiplier.
__kernel void int8_mul_rate_test(__global char16 *A, int count, __global char16 *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global char16 *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
char16 v0 = local_a[masked_tid];
char16 v1 = local_a[masked_tid + 1];
char16 v2 = local_a[masked_tid + 2];
char16 v3 = local_a[masked_tid + 3];
char16 v4 = v0 + v1;
char16 v5 = v0 + v2;
char16 v6 = v0 + v3;
char16 v7 = v1 + v2;
char16 acc = local_a[0];
for (int i = 0; i < count; i++) {
v0 *= acc;
v1 *= acc;
v2 *= acc;
v3 *= acc;
v4 *= acc;
v5 *= acc;
v6 *= acc;
v7 *= acc;
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 FMA latency: unlike the rate tests, every statement reads the result of
// the previous one, forming a single serial dependency chain, so runtime is
// dominated by FMA latency rather than throughput. The loop body is unrolled
// 4x (32 dependent FMAs) while i advances by 4 — presumably the host accounts
// for ops-per-iteration when converting time to latency; TODO confirm.
__kernel void fp32_fma_latency_test(__global float *A, int count, __global float *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed values from memory so the chain cannot be constant-folded
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
float acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 add latency: a single serial dependency chain of adds (each statement
// consumes the previous result). Unrolled 4x (32 dependent adds) per loop
// iteration while i advances by 4 — presumably the host normalizes; TODO confirm.
__kernel void fp32_add_latency_test(__global float *A, int count, __global float *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed values from memory so the chain cannot be constant-folded
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
float acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT32 add latency: a single serial dependency chain of 32-bit adds, unrolled
// 4x (32 dependent adds) per loop iteration while i advances by 4 — presumably
// the host normalizes; TODO confirm.
__kernel void int32_add_latency_test(__global uint *A, int count, __global uint *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed values from memory so the chain cannot be constant-folded
uint v0 = A[masked_tid];
uint v1 = A[masked_tid + 1];
uint v2 = A[masked_tid + 2];
uint v3 = A[masked_tid + 3];
uint v4 = v0 + v1;
uint v5 = v0 + v2;
uint v6 = v0 + v3;
uint v7 = v1 + v2;
for (int i = 0; i < count; i += 4) {
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT32 multiply latency: a single serial dependency chain of 32-bit multiplies,
// unrolled 4x (32 dependent multiplies) per loop iteration while i advances by
// 4 — presumably the host normalizes; TODO confirm.
__kernel void int32_mul_latency_test(__global uint *A, int count, __global uint *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global uint *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed values from memory so the chain cannot be constant-folded
uint v0 = local_a[masked_tid];
uint v1 = local_a[masked_tid + 1];
uint v2 = local_a[masked_tid + 2];
uint v3 = local_a[masked_tid + 3];
uint v4 = v0 + v1;
uint v5 = v0 + v2;
uint v6 = v0 + v3;
uint v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
uint acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// Divergence penalty test: each work-item picks add vs multiply based on its
// own input value (op < 0.5), so lanes within a wavefront/warp can diverge and
// the hardware may have to execute both sides predicated. Comparing against
// the non-divergent tests shows the cost. Work-items with op >= 1.0 skip the
// loop entirely.
__kernel void fp32_divergence_rate_test(__global float *A, int count, __global float *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
float acc = A[0];
// per-work-item selector read from the input, so the compiler can't prove
// which branch runs
float op = A[get_global_id(0)];
if (op < 1.0) {
for (int i = 0; i < count; i++) {
if (op < 0.5) {
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
else
{
v0 *= acc;
v1 *= acc;
v2 *= acc;
v3 *= acc;
v4 *= acc;
v5 *= acc;
v6 *= acc;
v7 *= acc;
}
}
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// Partition test: only work-items whose input value is < 1.0 execute the add
// loop at all; the rest skip straight to the store. Presumably used with host
// data that activates a subset of lanes, to see how throughput scales with the
// active fraction — confirm against the host-side caller.
__kernel void fp32_partition_rate_test(__global float *A, int count, __global float *ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float *local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed 8 independent chains from memory to defeat constant folding
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
float acc = A[0];
// per-work-item activation flag read from the input
float op = A[get_global_id(0)];
if (op < 1.0) {
for (int i = 0; i < count; i++) {
v0 += acc;
v1 += acc;
v2 += acc;
v3 += acc;
v4 += acc;
v5 += acc;
v6 += acc;
v7 += acc;
}
}
// sum all chains so none is dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
/// Scalar latency
// INT32 add latency, uniform variant: tid is forced to 0 so every work-item
// loads the same addresses and computes the same serial chain. NOTE(review):
// presumably this lets a compiler place the chain on a scalar/uniform datapath
// (e.g. AMD's scalar ALU) — confirm against generated ISA. Chain structure
// matches int32_add_latency_test (32 dependent adds per iteration, i += 4).
__kernel void int32_add_scalar_latency_test(__global uint* A, int count, __global uint* ret) {
int tid = 0;
int max_offset = get_local_size(0);
// with tid = 0, masked_tid is always 0
int masked_tid = tid & (rate_local_mem_test_size - 1);
uint v0 = A[masked_tid];
uint v1 = A[masked_tid + 1];
uint v2 = A[masked_tid + 2];
uint v3 = A[masked_tid + 3];
uint v4 = v0 + v1;
uint v5 = v0 + v2;
uint v6 = v0 + v3;
uint v7 = v1 + v2;
for (int i = 0; i < count; i += 4) {
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// INT32 multiply latency, uniform variant: tid forced to 0 so all work-items
// compute an identical serial chain (see int32_add_scalar_latency_test for the
// rationale). 32 dependent multiplies per iteration, i += 4.
__kernel void int32_mul_scalar_latency_test(__global uint* A, int count, __global uint* ret) {
int tid = 0;
int max_offset = get_local_size(0);
__global uint* local_a = A;
// with tid = 0, masked_tid is always 0
int masked_tid = tid & (rate_local_mem_test_size - 1);
uint v0 = local_a[masked_tid];
uint v1 = local_a[masked_tid + 1];
uint v2 = local_a[masked_tid + 2];
uint v3 = local_a[masked_tid + 3];
uint v4 = v0 + v1;
uint v5 = v0 + v2;
uint v6 = v0 + v3;
uint v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
uint acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 add latency, uniform variant: tid forced to 0 so all work-items compute
// an identical serial chain (see int32_add_scalar_latency_test). NOTE(review):
// this loop is unrolled 8x (64 dependent adds) while i advances by 8, unlike
// the other latency kernels which unroll 4x — confirm the host accounts for
// the same ops-per-count ratio.
__kernel void fp32_add_scalar_latency_test(__global float* A, int count, __global float* ret) {
int tid = 0;
int max_offset = get_local_size(0);
__global float* local_a = A;
// with tid = 0, masked_tid is always 0
int masked_tid = tid & (rate_local_mem_test_size - 1);
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
float acc = local_a[0];
for (int i = 0; i < count; i += 8) {
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
v0 = v7 + v0;
v1 = v0 + v1;
v2 = v1 + v2;
v3 = v2 + v3;
v4 = v3 + v4;
v5 = v4 + v5;
v6 = v5 + v6;
v7 = v6 + v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 FMA latency, uniform variant: tid forced to 0 so all work-items compute
// an identical serial chain (see int32_add_scalar_latency_test). 32 dependent
// v = prev + acc * v steps per iteration, i += 4.
__kernel void fp32_fma_scalar_latency_test(__global float* A, int count, __global float* ret) {
int tid = 0;
int max_offset = get_local_size(0);
__global float* local_a = A;
// with tid = 0, masked_tid is always 0
int masked_tid = tid & (rate_local_mem_test_size - 1);
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
float acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
v0 = v7 + acc * v0;
v1 = v0 + acc * v1;
v2 = v1 + acc * v2;
v3 = v2 + acc * v3;
v4 = v3 + acc * v4;
v5 = v4 + acc * v5;
v6 = v5 + acc * v6;
v7 = v6 + acc * v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 multiply latency, uniform variant: tid forced to 0 so all work-items
// compute an identical serial chain (see int32_add_scalar_latency_test).
// 32 dependent multiplies per iteration, i += 4.
__kernel void fp32_mul_scalar_latency_test(__global float* A, int count, __global float* ret) {
int tid = 0;
int max_offset = get_local_size(0);
__global float* local_a = A;
// with tid = 0, masked_tid is always 0
int masked_tid = tid & (rate_local_mem_test_size - 1);
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
float acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
// FP32 multiply latency: a single serial dependency chain of multiplies,
// unrolled 4x (32 dependent multiplies) per loop iteration while i advances by
// 4 — presumably the host normalizes; TODO confirm. Per-work-item variant of
// fp32_mul_scalar_latency_test (tid comes from get_local_id here).
__kernel void fp32_mul_latency_test(__global float* A, int count, __global float* ret) {
int tid = get_local_id(0);
int max_offset = get_local_size(0);
__global float* local_a = A;
int masked_tid = tid & (rate_local_mem_test_size - 1);
// seed values from memory so the chain cannot be constant-folded
float v0 = local_a[masked_tid];
float v1 = local_a[masked_tid + 1];
float v2 = local_a[masked_tid + 2];
float v3 = local_a[masked_tid + 3];
float v4 = v0 + v1;
float v5 = v0 + v2;
float v6 = v0 + v3;
float v7 = v1 + v2;
// acc is unused in the loop; kept for symmetry with the other kernels
float acc = local_a[0];
for (int i = 0; i < count; i += 4) {
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
v0 = v7 * v0;
v1 = v0 * v1;
v2 = v1 * v2;
v3 = v2 * v3;
v4 = v3 * v4;
v5 = v4 * v5;
v6 = v5 * v6;
v7 = v6 * v7;
}
// store the chain result so the work is not dead code
ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}
================================================
FILE: GpuMemLatency/kernel.cl
================================================
// not used, I tried
__constant sampler_t direct_sampler = CLK_NORMALIZED_COORDS_FALSE | // coordinates are from 0 to max dimension size
CLK_ADDRESS_NONE | // if it goes out of bounds feel free to explode and die
CLK_FILTER_NEAREST;
// Pointer-chasing latency through the texture/image path. Each texel's .x
// component stores the index of the next texel to visit.
// list_size is unused here but kept so the host-side kernel interface matches.
__kernel void tex_latency_test(__read_only image1d_buffer_t A, int count, __global int* ret, int list_size) {
    // With more than one thread in flight, resume each thread's chase from
    // where it left off (stashed in ret); otherwise start at texel 0.
    int startPos = get_global_size(0) > 1 ? ret[get_global_id(0)] : 0;
    // Sampler-less read: using a sampler (see direct_sampler above) broke this test
    uint4 current = read_imageui(A, startPos);
    for (int i = 0; i < count; i += 10) {
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
    }
    // Persist the position so the chain can't be optimized out and a
    // follow-up launch can resume from it
    ret[get_global_id(0)] = current.x;
}
__constant sampler_t funny_sampler = CLK_NORMALIZED_COORDS_TRUE | // coordinates are from 0 to 1 (float)
CLK_ADDRESS_REPEAT | // going out of bounds = replicate
CLK_FILTER_NEAREST;
// Texture (2D image) bandwidth test: normalized coordinates with a repeat
// sampler, so out-of-range coordinates simply wrap around.
__kernel void tex_bw_test(__read_only image2d_t A, int count, __global float* ret) {
    int localId = get_local_id(0);
    // Spread starting positions across [0, 1) by global id
    float pos = get_global_id(0) * native_recip((float)get_global_size(0));
    float2 increment;
    increment.x = 0.01; // guessing
    increment.y = 0.01;
    float2 current0, current1, current2, current3;
    current0.x = pos;
    current0.y = pos;
    // BUGFIX: these offsets used integer division (localId / 10000 == 0 for any
    // realistic local id), so threads sampled identical coordinates. Divide in
    // floating point to give each lane a distinct position.
    current1.x = 0.1f + (localId / 10000.0f);
    current1.y = 0.1f + (localId / 10000.0f);
    current2.x = 0.01f + (localId / 10000.0f);
    current2.y = 0.01f + (localId / 10000.0f);
    current3.x = 0.002f + (localId / 5000.0f);
    current3.y = 0.001f + (localId / 5000.0f);
    // Four independent accumulator streams to keep read requests in flight
    float4 tmp0 = read_imagef(A, funny_sampler, current0);
    float4 tmp1 = read_imagef(A, funny_sampler, current1);
    float4 tmp2 = read_imagef(A, funny_sampler, current2);
    float4 tmp3 = read_imagef(A, funny_sampler, current3);
    for (int i = 0; i < count; i += 4)
    {
        tmp0 += read_imagef(A, funny_sampler, current0);
        tmp1 += read_imagef(A, funny_sampler, current1);
        tmp2 += read_imagef(A, funny_sampler, current2);
        tmp3 += read_imagef(A, funny_sampler, current3);
        current0 += increment;
        current1 += increment;
        current2 += increment;
        current3 += increment;
    }
    // Reduce to one scalar sink so the reads are not dead code
    *ret = dot(tmp0, tmp1) + dot(tmp2, tmp3);
}
// Cacheline size in bytes, must correspond to what's defined for the latency test
#define CACHELINE_SIZE 64
// Classic pointer-chasing latency test, unrolled 10x (Terascale stopped
// improving past a 10x unroll). Assumes count is a multiple of 10; with a
// large count the error from a non-multiple is negligible.
__kernel void unrolled_latency_test(__global const int* A, int count, __global int* ret) {
    // Multiple threads -> resume from a per-thread position in ret (tests
    // vector latency on AMD); single thread -> start at A[0] (scalar latency).
    int current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0];
    // BUGFIX: result was accumulated without ever being initialized (undefined
    // value); start it at 0 so the checksum written to ret is well-defined.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }
    // Sink so the chase can't be eliminated
    ret[0] = result;
}
// Ensures the loaded value (and hence the chased address) is constant across a
// workgroup, steering the access onto scalar/uniform hardware where present.
__kernel void scalar_unrolled_latency_test(__global const int* A, int count, __global int* ret) {
    // All lanes in a workgroup read the same ret element, so the chain stays uniform
    int current = get_num_groups(0) > 1 ? ret[get_group_id(0) * get_local_size(0)]: A[0];
    // BUGFIX: result was accumulated without ever being initialized (undefined
    // value); start it at 0 so the checksum written to ret is well-defined.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }
    // Sink so the chase can't be eliminated
    ret[0] = result;
}
// Runs one independent pointer chase per thread; size is the number of chain
// entries, so threads start at different points. Measures how much latency the
// GPU can hide as the number of in-flight chases grows.
__kernel void parallel_latency_test(__global const int* A, int count, int size, __global int* ret) {
    size_t tid = get_global_id(0);
    // Every thread begins at its own slot in the chain
    int node = A[tid % size];
    int sum = 0;
    // 10x unrolled dependent-load chain; each load's address comes from the
    // previous load, so nothing can be issued ahead of time
    for (int iter = 0; iter < count; iter += 10) {
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
        sum += node;
        node = A[node];
    }
    // Per-thread sink keeps every chase alive
    ret[tid] = sum;
}
// Latency test like unrolled_latency_test, but chasing through __constant
// memory to probe the constant cache path.
__kernel void constant_unrolled_latency_test(__constant const int* A, int count, __global int* ret) {
    // Multiple threads -> resume from per-thread position in ret; single
    // thread -> start from A[0]
    int current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0];
    // BUGFIX: result was accumulated without ever being initialized (undefined
    // value); start it at 0 so the checksum written to ret is well-defined.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }
    // Sink so the chase can't be eliminated
    ret[0] = result;
}
#define local_mem_test_size 1024
// Local memory (LDS/shmem) latency: stage the chain into a 4 KB __local array,
// then chase it from a single lane.
__kernel void local_unrolled_latency_test(__global const int* A, int count, __global int* ret) {
    __local int local_a[local_mem_test_size]; // 4 KB, should be present on all GPUs, amirite?
    // Workgroup-wide strided copy from global into local memory
    for (int i = get_local_id(0);i < local_mem_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // Only lane 0 chases; everyone else can chill/get masked off
    if (get_local_id(0) == 0) {
        int current = local_a[0];
        // BUGFIX: result was accumulated without ever being initialized
        // (undefined value); start it at 0 so the checksum is well-defined.
        int result = 0;
        for (int i = 0; i < count; i += 10) {
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
        }
        // Sink so the chase can't be eliminated
        ret[0] = result;
    }
}
// Global memory read bandwidth: each thread strides through A (viewed as
// float4s) from a host-provided start position, wrapping back to its start
// when it runs off the end. skip is unused here but kept for interface
// compatibility with the host's start-position generation.
__kernel void sum_bw_test(__global float* A, uint count, uint float4size, __global float* ret, uint skip, __global uint *startPositions) {
    int threadId = get_global_id(0);
    int localSize = get_local_size(0);
    // BUGFIX: the accumulators were initialized with parenthesized comma
    // expressions like (0.1f,0.2f,0.3f,0.4f), which evaluate to just the last
    // scalar broadcast to all lanes. Use proper float4 literals.
    float4 result1 = (float4)(0.1f, 0.2f, 0.3f, 0.4f);
    float4 result2 = (float4)(1.1f, 1.2f, 1.3f, 1.4f);
    float4 result3 = (float4)(2.1f, 2.2f, 2.3f, 2.4f);
    float4 result4 = (float4)(3.0f, 3.1f, 3.2f, 3.3f);
    float4 result5 = (float4)(4.0f, 4.2f, 4.1f, 4.3f);
    // Host precomputes where each thread starts
    int initialIdx = startPositions[threadId];
    int idx = initialIdx;
    __global float4 *B = (__global float4 *)A;
    // i += 20: five float4 loads = 20 floats per iteration
    for (int i = 0; i < count; i += 20) {
        result1 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result2 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result3 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result4 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result5 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
    }
    // Reduce to one scalar per thread so the loads are not dead code
    ret[threadId] = dot(result1, result2) + dot(result3, result4) + dot(result4, result5);
}
#define local_mem_bw_test_size 1024
// Test bandwidth with local memory. A must be at least
// local_mem_bw_test_size floats.
__kernel void local_bw_test(__global float* A, uint count, __global float* ret) {
    __local float local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    float acc1 = 1.1;
    float acc2 = 2.2;
    float acc3 = 3.3;
    float acc4 = 4.4;
    // Workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // BUGFIX: mask the starting indices too - with a large workgroup,
    // localId + 2 * localSize could exceed 1023 and read past local_a on the
    // first iteration. Masking is a no-op when the index was already in range.
    int idx0 = localId & 0x3FF;
    int idx1 = (localId + localSize) & 0x3FF;
    int idx2 = (localId + localSize * 2) & 0x3FF;
    // i += 12: twelve float loads per iteration
    for (int i = 0; i < count; i += 12) {
        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        // BUGFIX: idx can be 0x3FF after masking, so the raw +1..+3 offsets
        // read up to local_a[1026], past the 1024-entry array. Wrap them.
        acc2 += local_a[(idx0 + 1) & 0x3FF] * local_a[(idx1 + 1) & 0x3FF] + local_a[(idx2 + 1) & 0x3FF];
        acc3 += local_a[(idx0 + 2) & 0x3FF] * local_a[(idx1 + 2) & 0x3FF] + local_a[(idx2 + 2) & 0x3FF];
        acc4 += local_a[(idx0 + 3) & 0x3FF] * local_a[(idx1 + 3) & 0x3FF] + local_a[(idx2 + 3) & 0x3FF];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = acc1 + acc2 + acc3 + acc4;
}
// LDS bandwidth with float4 elements (16 KB of local memory).
__kernel void local_float4_bw_test(__global float4* A, uint count, __global float* ret) {
    __local float4 local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    // Seed accumulators from the input so they can't be constant-folded
    float4 acc1 = A[get_global_id(0) & 0x3FF];
    float4 acc2 = A[(get_global_id(0) + 1) & 0x3FF];
    float4 acc3 = A[(get_global_id(0) + 2) & 0x3FF];
    float4 acc4 = A[(get_global_id(0) + 3) & 0x3FF];
    // Workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // BUGFIX: mask the starting indices too - with a large workgroup,
    // localId + 2 * localSize could exceed 1023 on the first iteration.
    // Masking is a no-op when the index was already in range.
    int idx0 = localId & 0x3FF;
    int idx1 = (localId + localSize) & 0x3FF;
    int idx2 = (localId + localSize * 2) & 0x3FF;
    // i += 12*4: twelve float4 loads = 48 floats per iteration
    for (int i = 0; i < count; i += (12*4)) {
        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        // BUGFIX: idx can be 0x3FF after masking, so the raw +1..+3 offsets
        // read up to local_a[1026], past the 1024-entry array. Wrap them.
        acc2 += local_a[(idx0 + 1) & 0x3FF] * local_a[(idx1 + 1) & 0x3FF] + local_a[(idx2 + 1) & 0x3FF];
        acc3 += local_a[(idx0 + 2) & 0x3FF] * local_a[(idx1 + 2) & 0x3FF] + local_a[(idx2 + 2) & 0x3FF];
        acc4 += local_a[(idx0 + 3) & 0x3FF] * local_a[(idx1 + 3) & 0x3FF] + local_a[(idx2 + 3) & 0x3FF];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = dot(acc1, acc2) + dot(acc3, acc4);
}
#define local64_test_size 2048 // size was given in 4B elements. This test uses 8B
// 64-bit LDS bandwidth: XOR-reduce ulong elements out of local memory.
__kernel void local_64_bw_test(__global ulong* A, uint count, __global ulong* ret) {
    __local ulong local_a[local64_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local64_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    ulong acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
    // assumes local memory size is at least 512x 64-bit uints
    // NOTE(review): the 0x1FF mask confines the steady-state walk to the first
    // 512 of the 2048 staged elements, and idx1 + 1 is unmasked on the first
    // iteration (it can reach localId + localSize + 1) - confirm workgroup
    // sizes keep that below local64_test_size.
    int idx0 = localId;
    int idx1 = localId + localSize;
    // i += 8 because each iteration moves 4 ulongs = 8 4-byte elements
    // (count is expressed in 4-byte units, per the define's comment)
    for (int i = 0; i < count; i += 8) {
        acc0 ^= local_a[idx0];
        acc1 ^= local_a[idx1];
        acc2 ^= local_a[idx0 + 1];
        acc3 ^= local_a[idx1 + 1];
        idx0 = (idx0 + localSize) & 0x1FF;
        idx1 = (idx1 + localSize) & 0x1FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = acc0 + acc1 + acc2 + acc3;
}
// Let's try the method from Zhe Jia et al.: every lane runs its own dependent
// pointer chase through local memory, so throughput under full occupancy
// reflects LDS bandwidth.
__kernel void local_chase_bw(__global uint* A, uint count, __global uint* ret) {
    // BUGFIX: this array was declared ulong while A and the chase values are
    // uint. That silently widened every element, doubling LDS usage and
    // turning the intended 32-bit LDS accesses into 64-bit ones.
    __local uint local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    // Each lane starts its chase at its own local id
    uint sink = localId;
    // Workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i = 0; i < count; i += 4)
    {
        sink = local_a[sink];
        sink = local_a[sink];
        sink = local_a[sink];
        sink = local_a[sink];
    }
    // Per-thread sink keeps every chase alive
    ret[threadId] = sink;
}
#define fixed_tex_test_size 1024
// Bandwidth through the texture/buffer-image path. Each read_imageui pulls a
// 4-wide uint vector; the 0x3FF masks keep indices inside the
// fixed_tex_test_size (1024) texel window.
__kernel void buffer_bw_test(__read_only image1d_buffer_t A, uint count, __global float* ret) {
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    // Seed accumulators from the buffer so they can't be constant-folded
    uint4 acc1 = read_imageui(A, 0);
    uint4 acc2 = read_imageui(A, 1);
    uint4 acc3 = read_imageui(A, 2);
    uint4 acc4 = read_imageui(A, 3);
    int idx0 = localId;
    int idx1 = localId + localSize;
    int idx2 = localId + localSize * 2;
    // i += 16: four counted reads x 4 components per iteration
    for (int i = 0; i < count; i += 16) {
        // BUGFIX: a fifth read_imageui(A, idx0) whose result was discarded sat
        // here - it wasn't counted by the i += 16 bookkeeping, skewing the
        // result on compilers that didn't eliminate it. Removed.
        acc1 += read_imageui(A, idx0);
        acc2 += read_imageui(A, idx1);
        acc3 += read_imageui(A, idx2);
        // BUGFIX: idx0 can be 0x3FF, so idx0 + 1 could address texel 1024,
        // one past the buffer; samplerless out-of-range reads are undefined.
        acc4 += read_imageui(A, (idx0 + 1) & 0x3FF);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce so the reads are not dead code
    float4 out1 = convert_float4(acc1);
    float4 out2 = convert_float4(acc2);
    float4 out3 = convert_float4(acc3);
    float4 out4 = convert_float4(acc4);
    ret[threadId] = dot(out1, out2) + dot(out3, out4);
}
// Integer add throughput: 12 dependent adds into sum per iteration.
// A = inputs, fixed size (only the first four elements are read, so the adds
// can't be constant-folded).
__kernel void int_exec_latency_test(__global int* A, int count, __global int* ret) {
    int sum = 0;
    int input1 = A[0], input2 = A[1], input3 = A[2], input4 = A[3];
    for (int i = 0; i < count; i++) {
        sum += input1;
        sum += input2;
        sum += input3;
        sum += input4;
        sum += input1;
        sum += input2;
        sum += input3;
        sum += input4;
        sum += input1;
        sum += input2;
        sum += input3;
        sum += input4;
    }
    // BUGFIX: sum was never written anywhere, so the entire loop was dead code
    // the compiler could (and likely did) eliminate, voiding the measurement.
    ret[0] = sum;
}
// hoping each thread/workgroup lands on a different CU
// A = pointer to location being bounced around
// count = iterations
// ret = sink
// t1 = id of thread 1
// t2 = id of thread 2
// Ticket ping-pong: t1 owns odd values, t2 owns even values. Each CAS succeeds
// only after the peer has published current - 1, so *A climbs 1,2,3,... and
// total time / (2 * count) approximates one cross-core atomic handoff.
__kernel void c2c_atomic_exec_latency_test(__global int* A, int count, __global int* ret, int t1, int t2) {
    int global_id = get_global_id(0);
    int current = 0;
    // The two participants claim alternating values of the sequence
    if (global_id == t1) current = 1;
    else if (global_id == t2) current = 2;
    // Every other thread exits immediately
    if (global_id == t1 || global_id == t2) {
        //printf("gid: %d, t1: %d, t2: %d, A: %d, current = %d\n", global_id, t1, t2, *A, current);
        while (current <= 2 * count) {
            // Spin until the peer publishes current - 1, then take the slot
            if (atomic_cmpxchg(A, current - 1, current) == current - 1) {
                current += 2;  // skip over the peer's next value
            }
        }
        ret[0] = current;
    }
}
// Ticket-style atomic handoff: thread g starts at ticket g + 1 and may only
// advance after a peer has published ticket - 1, so participants take turns
// bumping *A via compare-and-swap up to 2 * count.
__kernel void atomic_exec_latency_test(__global int* A, int count, __global int* ret) {
    int ticket = get_global_id(0) + 1;
    int limit = 2 * count;
    while (ticket <= limit) {
        int seen = atomic_cmpxchg(A, ticket - 1, ticket);
        if (seen == ticket - 1)
            ticket += 2;  // skip over the peer's ticket
    }
}
// Same ticket-style atomic ping-pong as atomic_exec_latency_test, but the
// contended location lives in local memory (LDS).
__kernel void local_atomic_latency_test(__global int* A, int count, __global int* ret) {
    __local int a[1];
    int current = get_global_id(0) + 1;
    // Thread with global id 0 seeds the LDS location from A
    if (current == 1) a[0] = A[0];
    // NOTE(review): barrier only synchronizes within one workgroup - this
    // assumes all participating threads share a workgroup; verify the launch.
    barrier(CLK_LOCAL_MEM_FENCE);
    while (current <= 2 * count) {
        // Advance only after a peer publishes current - 1
        if (atomic_cmpxchg(a, current - 1, current) == current - 1) {
            current += 2;  // skip over the peer's ticket
        }
    }
}
// Trivial kernel: each thread bumps its own element by one.
__kernel void dummy_add(__global int* A) {
    int gid = get_global_id(0);
    A[gid] = A[gid] + 1;
}
================================================
FILE: GpuMemLatency/kernels/atomic_exec_latency_test.cl
================================================
// Ticket-style atomic handoff: thread g starts at ticket g + 1; a CAS succeeds
// only when a peer has already published ticket - 1, so threads take turns
// incrementing *A up to 2 * count.
__kernel void atomic_exec_latency_test(__global int* A, int count, __global int* ret) {
    int current = get_global_id(0) + 1;
    while (current <= 2 * count) {
        if (atomic_cmpxchg(A, current - 1, current) == current - 1) {
            current += 2;  // skip over the peer's ticket
        }
    }
    // NOTE(review): nothing is written to ret here; the atomics themselves
    // keep the loop from being optimized away.
}
// Global-memory atomic add throughput: every thread hammers its own element,
// so there is no contention - this measures raw atomic issue rate.
// Eight distinct addends keep the compiler from folding the adds together.
__kernel void atomic_add_test(__global int *A, int count) {
    int base = get_global_id(0);
    int a1 = base + 5, a2 = base + 6, a3 = base + 7, a4 = base + 8;
    int a5 = base + 9, a6 = base + 10, a7 = base + 11;
    __global int *target = A + base;
    for (int iter = 0; iter < count; iter++)
    {
        atomic_add(target, base);
        atomic_add(target, a1);
        atomic_add(target, a2);
        atomic_add(target, a3);
        atomic_add(target, a4);
        atomic_add(target, a5);
        atomic_add(target, a6);
        atomic_add(target, a7);
    }
}
================================================
FILE: GpuMemLatency/kernels/buffer_bw_test.cl
================================================
#define fixed_tex_test_size 1024
// Bandwidth through the texture/buffer-image path. Each read_imageui pulls a
// 4-wide uint vector; the 0x3FF masks keep indices inside the
// fixed_tex_test_size (1024) texel window.
__kernel void buffer_bw_test(__read_only image1d_buffer_t A, uint count, __global float* ret) {
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    // Seed accumulators from the buffer so they can't be constant-folded
    uint4 acc1 = read_imageui(A, 0);
    uint4 acc2 = read_imageui(A, 1);
    uint4 acc3 = read_imageui(A, 2);
    uint4 acc4 = read_imageui(A, 3);
    int idx0 = localId;
    int idx1 = localId + localSize;
    int idx2 = localId + localSize * 2;
    // i += 16: four counted reads x 4 components per iteration
    for (int i = 0; i < count; i += 16) {
        // BUGFIX: a fifth read_imageui(A, idx0) whose result was discarded sat
        // here - it wasn't counted by the i += 16 bookkeeping, skewing the
        // result on compilers that didn't eliminate it. Removed.
        acc1 += read_imageui(A, idx0);
        acc2 += read_imageui(A, idx1);
        acc3 += read_imageui(A, idx2);
        // BUGFIX: idx0 can be 0x3FF, so idx0 + 1 could address texel 1024,
        // one past the buffer; samplerless out-of-range reads are undefined.
        acc4 += read_imageui(A, (idx0 + 1) & 0x3FF);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce so the reads are not dead code
    float4 out1 = convert_float4(acc1);
    float4 out2 = convert_float4(acc2);
    float4 out3 = convert_float4(acc3);
    float4 out4 = convert_float4(acc4);
    ret[threadId] = dot(out1, out2) + dot(out3, out4);
}
================================================
FILE: GpuMemLatency/kernels/c2c_atomic_exec_latency_test.cl
================================================
// hoping each thread/workgroup lands on a different CU
// A = pointer to location being bounced around
// count = iterations
// ret = sink
// t1 = id of thread 1
// t2 = id of thread 2
// Ticket ping-pong: t1 owns odd values, t2 owns even values. Each CAS succeeds
// only after the peer has published current - 1, so *A climbs 1,2,3,... and
// total time / (2 * count) approximates one cross-core atomic handoff.
__kernel void c2c_atomic_exec_latency_test(__global int* A, int count, __global int* ret, int t1, int t2) {
    int global_id = get_global_id(0);
    int current = 0;
    // The two participants claim alternating values of the sequence
    if (global_id == t1) current = 1;
    else if (global_id == t2) current = 2;
    // Every other thread exits immediately
    if (global_id == t1 || global_id == t2) {
        //printf("gid: %d, t1: %d, t2: %d, A: %d, current = %d\n", global_id, t1, t2, *A, current);
        while (current <= 2 * count) {
            // Spin until the peer publishes current - 1, then take the slot
            if (atomic_cmpxchg(A, current - 1, current) == current - 1) {
                current += 2;  // skip over the peer's next value
            }
        }
        ret[0] = current;
    }
}
================================================
FILE: GpuMemLatency/kernels/constant_unrolled_latency_test.cl
================================================
// Latency test like the unrolled one above, but chasing through __constant
// memory to probe the constant cache path.
__kernel void constant_unrolled_latency_test(__constant const int* A, int count, __global int* ret) {
    // Multiple threads -> resume from per-thread position in ret; single
    // thread -> start from A[0]
    int current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0];
    // BUGFIX: result was accumulated without ever being initialized (undefined
    // value); start it at 0 so the checksum written to ret is well-defined.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }
    // Sink so the chase can't be eliminated
    ret[0] = result;
}
================================================
FILE: GpuMemLatency/kernels/ldst_bw_test.cl
================================================
#define ldst_bw_test_size 1024
// test load/store bandwidth with a small test size that should fit in L1
/*__kernel void ldst_bw_test(__global float* A, uint count, __global float* ret) {
int threadId = get_global_id(0);
int localId = get_local_id(0);
int localSize = get_local_size(0);
int groupId = get_group_id(0);
float acc1 = 1.1;
float acc2 = 2.2;
float acc3 = 3.3;
float acc4 = 4.4;
// assumes local memory size is at least 1024 float4s
int idx0 = localId;
int idx1 = localId + localSize;
int idx2 = localId + localSize * 2;
for (int i = 0; i < count; i += 12) {
acc1 += A[idx0] * A[idx1] + A[idx2];
idx0 = clamp(idx0 + localSize, 0, ldst_bw_test_size);
idx1 = clamp(idx1 + localSize, 0, ldst_bw_test_size);
idx2 = clamp(idx2 + localSize, 0, ldst_bw_test_size);
acc2 += A[idx0] * A[idx1] + A[idx2];
idx0 = clamp(idx0 + localSize, 0, ldst_bw_test_size);
idx1 = clamp(idx1 + localSize, 0, ldst_bw_test_size);
idx2 = clamp(idx2 + localSize, 0, ldst_bw_test_size);
acc3 += A[idx0] * A[idx1] + A[idx2];
idx0 = clamp(idx0 + localSize, 0, ldst_bw_test_size);
idx1 = clamp(idx1 + localSize, 0, ldst_bw_test_size);
idx2 = clamp(idx2 + localSize, 0, ldst_bw_test_size);
acc4 += A[idx0] * A[idx1] + A[idx2];
idx0 = clamp(idx0 + localSize, 0, ldst_bw_test_size);
idx1 = clamp(idx1 + localSize, 0, ldst_bw_test_size);
idx2 = clamp(idx2 + localSize, 0, ldst_bw_test_size);
}
ret[threadId] = acc1 + acc2 + acc3 + acc4;
}*/
// Load bandwidth through the global memory path with a footprint small enough
// to stay cache-resident. A is a global buffer of at least 1024 float4s -
// the 0x3FF masks wrap all steady-state indices into that window.
__kernel void ldst_bw_test(__global float4* A, uint count, __global float* ret) {
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    float acc1 = 1.1;
    float acc2 = 2.2;
    float acc3 = 3.3;
    float acc4 = 4.4;
    // NOTE(review): idx1..idx3 are unmasked before the first loop iteration,
    // so the first reads can land past index 1023 when 4 * localSize > 1024 -
    // confirm the host's workgroup size keeps them in range.
    int idx0 = localId;
    int idx1 = idx0 + localSize;
    int idx2 = idx1 + localSize;
    int idx3 = idx2 + localSize;
    // i += 16*4: each line reads four float4s (16 floats), four lines = 64
    // floats per iteration; count is expressed in scalar floats
    for (int i = 0; i < count; i += (16*4)) {
        acc1 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        idx3 = (idx3 + localSize) & 0x3FF;
        acc2 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        idx3 = (idx3 + localSize) & 0x3FF;
        acc3 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        idx3 = (idx3 + localSize) & 0x3FF;
        acc4 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        idx3 = (idx3 + localSize) & 0x3FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = acc1 + acc2 + acc3 + acc4;
}
================================================
FILE: GpuMemLatency/kernels/local_64_bw_test.cl
================================================
#define local64_test_size 2048 // size was given in 4B elements. This test uses 8B
// 64-bit LDS bandwidth: XOR-reduce ulong elements out of local memory.
__kernel void local_64_bw_test(__global ulong* A, uint count, __global ulong* ret) {
    __local ulong local_a[local64_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    // Workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local64_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    ulong acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;
    // The 0x1FF mask confines the steady-state walk to the first 512 elements
    int idx0 = localId;
    int idx1 = localId + localSize;
    // i += 8: each iteration moves 4 ulongs = 8 4-byte elements (count is
    // expressed in 4-byte units, per the define's comment)
    for (int i = 0; i < count; i += 8) {
        acc0 ^= local_a[idx0];
        acc1 ^= local_a[idx1];
        idx0 = (idx0 + localSize) & 0x1FF;
        idx1 = (idx1 + localSize) & 0x1FF;
        // BUGFIX: this second round used acc3 and an undeclared acc4 (a
        // compile error); the second pair of accumulators is acc2/acc3.
        acc2 ^= local_a[idx0];
        acc3 ^= local_a[idx1];
        idx0 = (idx0 + localSize) & 0x1FF;
        idx1 = (idx1 + localSize) & 0x1FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = acc0 + acc1 + acc2 + acc3;
}
================================================
FILE: GpuMemLatency/kernels/local_atomic_latency_test.cl
================================================
// Same ticket-style atomic ping-pong as atomic_exec_latency_test, but the
// contended location lives in local memory (LDS).
__kernel void local_atomic_latency_test(__global int* A, int count, __global int* ret) {
    __local int a[1];
    int current = get_global_id(0) + 1;
    // Thread with global id 0 seeds the LDS location from A
    if (current == 1) a[0] = A[0];
    // NOTE(review): barrier only synchronizes within one workgroup - this
    // assumes all participating threads share a workgroup; verify the launch.
    barrier(CLK_LOCAL_MEM_FENCE);
    while (current <= 2 * count) {
        // Advance only after a peer publishes current - 1
        if (atomic_cmpxchg(a, current - 1, current) == current - 1) {
            current += 2;  // skip over the peer's ticket
        }
    }
}
#define local_atomic_add_wg_size 256
// Local-memory atomic add throughput: every lane hammers its own LDS slot,
// so there is no contention. Eight distinct addends keep the compiler from
// folding the adds together.
__kernel void local_atomic_add_test(__global int *A, int count) {
    __local int local_a[local_atomic_add_wg_size];
    // Stage this lane's element into LDS
    local_a[get_local_id(0)] = A[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);
    int base = get_global_id(0);
    int a1 = base + 5, a2 = base + 6, a3 = base + 7, a4 = base + 8;
    int a5 = base + 9, a6 = base + 10, a7 = base + 11;
    __local int *slot = local_a + get_local_id(0);
    for (int iter = 0; iter < count; iter++)
    {
        atomic_add(slot, base);
        atomic_add(slot, a1);
        atomic_add(slot, a2);
        atomic_add(slot, a3);
        atomic_add(slot, a4);
        atomic_add(slot, a5);
        atomic_add(slot, a6);
        atomic_add(slot, a7);
    }
    // Write the accumulated value back so the atomics are not dead code
    A[get_global_id(0)] = local_a[get_local_id(0)];
}
================================================
FILE: GpuMemLatency/kernels/local_bw_test.cl
================================================
#define local_mem_bw_test_size 1024
// Test bandwidth with local memory. A must be at least
// local_mem_bw_test_size floats.
__kernel void local_bw_test(__global float* A, uint count, __global float* ret) {
    __local float local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    float acc1 = 1.1;
    float acc2 = 2.2;
    float acc3 = 3.3;
    float acc4 = 4.4;
    //printf("subgroup size %d\n", get_sub_group_size());
    // Workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // BUGFIX: mask the starting indices - with a large workgroup,
    // localId + 2 * localSize can exceed 1023 and read past local_a on the
    // first iteration. Masking is a no-op when the index was already in range.
    int idx0 = localId & 0x3FF;
    int idx1 = (localId + localSize) & 0x3FF;
    int idx2 = (localId + localSize * 2) & 0x3FF;
    // i += 12: twelve float loads per iteration
    for (int i = 0; i < count; i += 12) {
        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc2 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc3 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc4 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = acc1 + acc2 + acc3 + acc4;
}
================================================
FILE: GpuMemLatency/kernels/local_float4_bw_test.cl
================================================
#define local_mem_bw_test_size 1024
// LDS bandwidth with float4 elements (16 KB of local memory).
__kernel void local_float4_bw_test(__global float4* A, uint count, __global float* ret) {
    __local float4 local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    // Seed accumulators from the input so they can't be constant-folded
    float4 acc1 = A[get_global_id(0) & 0x3FF];
    float4 acc2 = A[(get_global_id(0) + 1) & 0x3FF];
    float4 acc3 = A[(get_global_id(0) + 2) & 0x3FF];
    float4 acc4 = A[(get_global_id(0) + 3) & 0x3FF];
    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0); i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // assumes local memory size is at least 1024 float4s
    // NOTE(review): idx1/idx2 are unmasked until the end of the first
    // iteration; they stay below 1024 only when 3 * localSize <= 1024 -
    // confirm the host's workgroup size.
    int idx0 = localId;
    int idx1 = localId + localSize;
    int idx2 = localId + localSize * 2;
    // i += 12*4: twelve float4 loads = 48 floats per iteration; count is in floats
    for (int i = 0; i < count; i += (12 * 4)) {
        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc2 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc3 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc4 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce so the loads are not dead code
    ret[threadId] = dot(acc1, acc2) + dot(acc3, acc4);
}
// Mixed-path bandwidth: each line reads two float4s from global memory and
// does a read-modify-write of one float4 in local memory, stressing both
// datapaths at once.
__kernel void mixed_float4_bw_test(__global float4* A, uint count, __global float* ret) {
    __local float4 local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    // acc1..acc8 warm up / seed reads spread across the buffer
    float4 acc1 = A[get_global_id(0) & 0x3FF];
    float4 acc2 = A[(get_global_id(0) + 1) & 0x3FF];
    float4 acc3 = A[(get_global_id(0) + 2) & 0x3FF];
    float4 acc4 = A[(get_global_id(0) + 3) & 0x3FF];
    float4 acc5 = A[(get_global_id(0) + 4) & 0x3FF];
    float4 acc6 = A[(get_global_id(0) + 5) & 0x3FF];
    float4 acc7 = A[(get_global_id(0) + 6) & 0x3FF];
    float4 acc8 = A[(get_global_id(0) + 7) & 0x3FF];
    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // assumes local memory size is at least 1024 float4s
    // NOTE(review): idx1/idx2 are unmasked until the end of the first
    // iteration, and lanes can collide on local_a[idx0] when localSize does
    // not evenly divide 1024 - confirm the host's workgroup size.
    int idx0 = localId;
    int idx1 = localId + localSize;
    int idx2 = localId + localSize * 2;
    // i += 16*4: each line moves 4 float4s (3 reads + 1 write) = 16 floats
    for (int i = 0; i < count; i += (16*4)) {
        local_a[idx0] += A[idx1] * A[idx2]; // 4 * (3R 1W)
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        local_a[idx0] += A[idx1] * A[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        local_a[idx0] += A[idx1] * A[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        local_a[idx0] += A[idx1] * A[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }
    // Reduce part of the LDS contents so the work is not dead code
    ret[threadId] = dot(local_a[get_local_id(0)], local_a[get_local_id(0) + 1]);
}
================================================
FILE: GpuMemLatency/kernels/local_unrolled_latency_test.cl
================================================
#define local_mem_test_size 1024
// Local memory (LDS/shmem) latency: stage the chain into a 4 KB __local array,
// then chase it from a single lane.
__kernel void local_unrolled_latency_test(__global const uint* A, int count, __global uint* ret) {
    __local uint local_a[local_mem_test_size]; // 4 KB, should be present on all GPUs, amirite?
    // Workgroup-wide strided copy from global into local memory
    for (int i = get_local_id(0);i < local_mem_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);
    // Only lane 0 chases; everyone else can chill/get masked off
    if (get_local_id(0) == 0) {
        uint current = local_a[0];
        // BUGFIX: result was accumulated without ever being initialized
        // (undefined value); start it at 0 so the checksum is well-defined.
        uint result = 0;
        for (int i = 0; i < count; i += 10) {
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
        }
        // Sink so the chase can't be eliminated
        ret[0] = result;
    }
}
================================================
FILE: GpuMemLatency/kernels/scalar_unrolled_latency_test.cl
================================================
// Global memory latency with scalar accesses: the starting index is chosen so
// the loaded value is constant across a workgroup (lets hardware with a scalar
// path, e.g. AMD's scalar unit, service the chase).
__kernel void scalar_unrolled_latency_test(__global const uint* A, int count, __global uint* ret) {
    // With more than one group, each group picks up a per-group start position
    // from ret[]; otherwise everyone chases from A[0].
    uint current = get_num_groups(0) > 1 ? ret[get_group_id(0) * get_local_size(0)]: A[0];
    uint result = 0; // fix: was uninitialized — accumulating into it was undefined behavior
    // 10x unrolled pointer chase; count assumed to be a multiple of 10
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }
    // result depends on every load, so the chain can't be optimized away
    ret[0] = result;
}
================================================
FILE: GpuMemLatency/kernels/sum_bw_test.cl
================================================
// Global memory read bandwidth test using float4 loads. Each thread starts at
// a precomputed position (startPositions, filled by the host), strides through
// B (= A reinterpreted as float4s) by the workgroup size, and wraps back to
// its start position when it runs off the end. Five independent accumulators
// break the dependency chain so the loop stays load-bound.
__kernel void sum_bw_test(__global float* A, uint count, uint float4size, __global float* ret, uint skip, __global uint *startPositions) {
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    // fix: these were written as (0.1f,0.2f,0.3f,0.4f) without the (float4)
    // cast — the comma operator reduced each to its last scalar, broadcast
    // across all lanes. (float4)(...) is the intended vector literal.
    float4 result1 = (float4)(0.1f, 0.2f, 0.3f, 0.4f);
    float4 result2 = (float4)(1.1f, 1.2f, 1.3f, 1.4f);
    float4 result3 = (float4)(2.1f, 2.2f, 2.3f, 2.4f);
    float4 result4 = (float4)(3.0f, 3.1f, 3.2f, 3.3f);
    float4 result5 = (float4)(4.0f, 4.2f, 4.1f, 4.3f);
    int initialIdx = startPositions[threadId];
    int idx = initialIdx;
    __global float4 *B = (__global float4 *)A; // view buffer as float4 elements
    for (int i = 0; i < count; i += 20) { // 5 float4 loads = 20 floats per iteration
        result1 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result2 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result3 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result4 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result5 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
    }
    // Reduce every accumulator into the output so no load is dead.
    // NOTE(review): result4 appears in two dot() terms — possibly one was meant
    // to be dot(result5, result5); harmless for a bandwidth measurement.
    ret[threadId] = dot(result1, result2) + dot(result3, result4) + dot(result4, result5);
}
================================================
FILE: GpuMemLatency/kernels/tex_bw_test.cl
================================================
__constant sampler_t funny_sampler = CLK_NORMALIZED_COORDS_TRUE | // coordinates are from 0 to 1 (float)
    CLK_ADDRESS_REPEAT |  // going out of bounds = wrap/replicate
    CLK_FILTER_NEAREST;   // no filtering, fetch the nearest texel

// Texture (sampled image) read bandwidth test: four independent coordinate
// streams per thread, each advanced by a fixed increment every iteration so
// the TMUs see a steady stream of distinct addresses.
__kernel void tex_bw_test(__read_only image2d_t A, int count, __global float* ret) {
    int localId = get_local_id(0);
    // spread threads evenly across the normalized [0,1) coordinate space
    float pos = get_global_id(0) * native_recip((float)get_global_size(0));
    float2 increment;
    increment.x = 0.01f; // guessing
    increment.y = 0.01f;
    float2 current0, current1, current2, current3;
    current0.x = pos;
    current0.y = pos;
    // fix: localId / 10000 was integer division, which truncates to 0 for any
    // realistic local id — every lane started from identical coordinates.
    // Float division gives each lane a distinct starting offset as intended.
    // (Constants also get f suffixes to avoid double literals in OpenCL C.)
    current1.x = 0.1f + (localId / 10000.0f);
    current1.y = 0.1f + (localId / 10000.0f);
    current2.x = 0.01f + (localId / 10000.0f);
    current2.y = 0.01f + (localId / 10000.0f);
    current3.x = 0.002f + (localId / 5000.0f);
    current3.y = 0.001f + (localId / 5000.0f);
    float4 tmp0 = read_imagef(A, funny_sampler, current0);
    float4 tmp1 = read_imagef(A, funny_sampler, current1);
    float4 tmp2 = read_imagef(A, funny_sampler, current2);
    float4 tmp3 = read_imagef(A, funny_sampler, current3);
    for (int i = 0; i < count; i += 4) // 4 texture reads per iteration
    {
        tmp0 += read_imagef(A, funny_sampler, current0);
        tmp1 += read_imagef(A, funny_sampler, current1);
        tmp2 += read_imagef(A, funny_sampler, current2);
        tmp3 += read_imagef(A, funny_sampler, current3);
        current0 += increment;
        current1 += increment;
        current2 += increment;
        current3 += increment;
    }
    // All threads write the same location; the store only exists to keep the
    // reads from being optimized away.
    *ret = dot(tmp0, tmp1) + dot(tmp2, tmp3);
}
================================================
FILE: GpuMemLatency/kernels/tex_latency_test.cl
================================================
// Texture latency: pointer chase through a 1-D buffer image, where each
// loaded texel's .x component is the coordinate of the next load. No sampler
// is used — read_imageui takes the integer coordinate directly. 10x unrolled
// like the other latency kernels; count assumed to be a multiple of 10.
__kernel void tex_latency_test(__read_only image1d_buffer_t A, int count, __global int* ret, int list_size) {
    int lid = get_local_id(0);
    int gid = get_global_id(0);
    // Multi-threaded runs read each thread's starting coordinate out of ret[];
    // a single-thread run just begins at texel 0.
    int start = (get_global_size(0) > 1) ? ret[gid] : 0;
    uint4 current = read_imageui(A, start);
    for (int i = 0; i < count; i += 10) {
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
    }
    // Store the final position so the chase is observable and not dead code
    ret[gid] = current.x;
}
================================================
FILE: GpuMemLatency/kernels/unrolled_latency_test.cl
================================================
// unrolled until terascale no longer saw further improvement (10x unroll)
// assumes count will be a multiple of 10. but it won't be too inaccurate with a big count
// not divisible by 10
// unrolled until terascale no longer saw further improvement (10x unroll)
// assumes count will be a multiple of 10. but it won't be too inaccurate with
// a big count not divisible by 10
__kernel void unrolled_latency_test(__global const uint* A, int count, __global uint* ret) {
    // With >1 work-item, each one picks its start from ret[] — this will test
    // vector latency on AMD. A single work-item starts at A[0] (scalar latency).
    uint current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0];
    uint result = 0; // fix: was uninitialized — accumulating into it was undefined behavior
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }
    // result depends on every load, so the chain can't be optimized away
    ret[0] = result;
}
================================================
FILE: GpuMemLatency/latency_test.c
================================================
#include "opencltest.h"
// list_size = number of 4B (32-bit) elements
// Pointer-chase latency test over a global memory buffer.
//
// list_size        = number of 4B (32-bit) elements in the chase array
// chase_iterations = number of dependent loads to time (kernels unroll by 10)
// uniform          = nonzero: one shared chase pattern for all threads.
//                    zero + multiple threads: array partitioned per wave
// threads          = OpenCL global work size (0 => single work-item)
// local_size       = OpenCL local work size (0 => single work-item)
// wave_size        = threads per wave when partitioning (non-uniform case)
// stride           = element spacing passed to FillPatternArr — exact units
//                    depend on that helper (not visible here); the sanity
//                    check below treats stride*2 vs list_size*4 as bytes
// elapsed_ms       = optional out-parameter, kernel wall time in ms
//
// Returns latency per load in nanoseconds; 0 on failure, 1.0 when the
// stride/list size combination would touch fewer than two cache lines.
float latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t list_size,
    uint32_t chase_iterations,
    short uniform,
    int threads,
    int local_size,
    int wave_size,
    int stride,
    uint32_t *elapsed_ms)
{
    size_t global_item_size = 1, local_item_size = 1;
    cl_int ret;
    float latency = 0.0f; // initialized so every exit path returns a defined value
    int64_t time_diff_ms;
    uint32_t result;
    if (threads && local_size)
    {
        local_item_size = local_size;
        global_item_size = threads;
    }

    // Sanity check: make sure the chase will visit at least 2 cache lines,
    // including the case where the array is partitioned across waves
    if (!uniform && ((stride * 2 > list_size * 4) || // 2 cache lines
        ((threads > 1) && (stride * 2 > (list_size * 4 / (threads / wave_size)))))) // handle partition case
    {
        fprintf(stderr, "Less than 2 lines will be visited with stride %d, list size %dx 32-bit INTs\n", stride, list_size);
        return 1.0f;
    }

    // Fill pattern arr
    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * list_size);
    uint32_t* thread_start = (uint32_t*)malloc(sizeof(uint32_t) * (global_item_size));
    if (A == NULL || thread_start == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory for latency test\n");
        free(A);
        free(thread_start);
        return 0.0f;
    }
    memset(A, 0, sizeof(uint32_t) * list_size);
    if (threads < 2 || uniform) {
        FillPatternArr(A, list_size, stride);
        thread_start[0] = 0;
    }
    else
    {
        if (wave_size <= 1) wave_size = 1;
        // partition pattern arr, creating a section for each wave
        int wave_count = threads / wave_size;
        int sub_list_size = list_size / wave_count;
        for (int waveId = 0; waveId < wave_count; waveId++)
        {
            int waveId_start = sub_list_size * waveId;
            thread_start[wave_size * waveId] = waveId_start;
            FillPatternArr(A + waveId_start, sub_list_size, stride);
            // offset indices so each wave's chase stays within its own section
            for (int subIdx = 0; subIdx < sub_list_size; subIdx++)
            {
                A[waveId_start + subIdx] += waveId_start;
            }
        }
        // make sure all threads in a wave access the same item
        for (int i = 1; i < threads; i++)
        {
            int waveId = i / wave_size;
            thread_start[i] = thread_start[waveId * wave_size];
        }
    }

    // copy array to device; the result buffer doubles as per-thread start positions
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_size * sizeof(uint32_t), NULL, &ret);
    clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, list_size * sizeof(uint32_t), A, 0, NULL, NULL);
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, global_item_size * sizeof(uint32_t), NULL, &ret);
    clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, global_item_size * sizeof(uint32_t), thread_start, 0, NULL, NULL);
    clFinish(command_queue);

    // Set kernel arguments
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to set list as kernel arg. clSetKernelArg returned %d\n", ret);
        latency = 0;
        goto cleanup;
    }
    ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);

    start_timing();
    // Execute the OpenCL kernel
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        latency = 0;
        goto cleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        latency = 0;
        goto cleanup;
    }
    time_diff_ms = end_timing();
    if (elapsed_ms != NULL) *elapsed_ms = (uint32_t)time_diff_ms;
    // ms -> ns, spread across all chase iterations
    latency = 1e6 * (float)time_diff_ms / (float)chase_iterations;
    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t), &result, 0, NULL, NULL);
    clFinish(command_queue);
cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    free(A);
    free(thread_start); // fix: thread_start was leaked on every call
    return latency;
}
// Texture path version of latency_test: the chase array is bound as a 1-D
// buffer image (CL_R / CL_UNSIGNED_INT32) and the kernel chases via
// read_imageui. Pattern stride is fixed at CACHELINE_SIZE. With multiple
// threads the array is partitioned per wave, same as latency_test.
// Returns latency per load in nanoseconds, 0 on failure.
float tex_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t list_size,
    uint32_t chase_iterations,
    int threads,
    int local_size,
    int wave_size)
{
    size_t global_item_size = 1, local_item_size = 1;
    cl_int ret = 0;
    cl_mem a_mem_obj = NULL, result_obj = NULL, tex_obj = NULL;
    float latency = 0;
    if (threads > 1)
    {
        global_item_size = threads;
        local_item_size = local_size;
    }
    uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * list_size);
    uint32_t* thread_start = (uint32_t*)malloc(sizeof(uint32_t) * (global_item_size));
    if (A == NULL || thread_start == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory for tex latency test\n");
        free(A);
        free(thread_start);
        return 0.0f;
    }
    memset(A, 0, sizeof(uint32_t) * list_size);
    if (threads < 2) {
        FillPatternArr(A, list_size, CACHELINE_SIZE);
        thread_start[0] = 0;
    }
    else
    {
        if (wave_size <= 1) wave_size = 1;
        // partition pattern arr, creating a section for each wave
        int wave_count = threads / wave_size;
        int sub_list_size = list_size / wave_count;
        for (int waveId = 0; waveId < wave_count; waveId++)
        {
            int waveId_start = sub_list_size * waveId;
            thread_start[wave_size * waveId] = waveId_start;
            FillPatternArr(A + waveId_start, sub_list_size, CACHELINE_SIZE);
            // offset indices so each wave's chase stays within its own section
            for (int subIdx = 0; subIdx < sub_list_size; subIdx++)
            {
                A[waveId_start + subIdx] += waveId_start;
            }
        }
        // make sure all threads in a wave access the same item
        for (int i = 1; i < threads; i++)
        {
            int waveId = i / wave_size;
            thread_start[i] = thread_start[waveId * wave_size];
        }
    }

    // use buffer as texture: create the backing buffer, then a 1D image over it
    a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_size * sizeof(uint32_t), NULL, &ret);
    clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, list_size * sizeof(uint32_t), A, 0, NULL, NULL);
    clFinish(command_queue);
    cl_image_format imageFormat;
    imageFormat.image_channel_data_type = CL_UNSIGNED_INT32;
    imageFormat.image_channel_order = CL_R; // one 32-bit uint per texel
    cl_image_desc imageDesc;
    memset(&imageDesc, 0, sizeof(cl_image_desc));
    imageDesc.buffer = a_mem_obj;
    imageDesc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
    imageDesc.image_width = list_size; // width in pixels
    tex_obj = clCreateImage(context, CL_MEM_READ_ONLY, &imageFormat, &imageDesc, NULL, &ret);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to create image: %d\n", ret);
        goto texLatencyCleanup;
    }
    size_t origin[] = { 0, 0, 0 };
    size_t region[] = { imageDesc.image_width, 1, 1 };
    ret = clEnqueueWriteImage(command_queue, tex_obj, CL_TRUE, origin, region, 0, 0, A, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to copy image: %d\n", ret);
        goto texLatencyCleanup;
    }
    // ret buffer holds each thread's start position on input, end position on output
    result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, global_item_size * sizeof(uint32_t), NULL, &ret);
    clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, global_item_size * sizeof(uint32_t), thread_start, 0, NULL, NULL);
    clFinish(command_queue);
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tex_obj);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    ret = clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&list_size);

    start_timing();
    // Execute the OpenCL kernel
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        latency = 0;
        goto texLatencyCleanup;
    }
    ret = clFinish(command_queue); // returns success even when TDR happens?
    if (ret != CL_SUCCESS)
    {
        printf("Failed to finish command queue. clFinish returned %d\n", ret);
        latency = 0;
        goto texLatencyCleanup;
    }
    uint64_t time_diff_ms = end_timing();
    // ms -> ns, spread across all chase iterations
    latency = 1e6 * (float)time_diff_ms / (float)chase_iterations;
    ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, global_item_size * sizeof(uint32_t), thread_start, 0, NULL, NULL);
    clFinish(command_queue);
texLatencyCleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(tex_obj);
    clReleaseMemObject(result_obj);
    free(A);
    free(thread_start); // fix: thread_start was leaked on every call
    return latency;
}
================================================
FILE: GpuMemLatency/local_mem_latency_kernel.cl
================================================
// for testing total local memory capacity by seeing when workgroups can no
// longer overlap in time due to local mem capacity limits across the GPU.
// Calling code is expected to define LATENCY_LOCAL_MEM_SIZE (elements) via
// build options; the index masks below assume it is a power of two.
__kernel void unrolled_latency_test_localmem(__global const int* A, int count, __global int* ret) {
    __local int local_a[LATENCY_LOCAL_MEM_SIZE]; // forces a local mem allocation per WG
    int start = A[0]; // every thread loads the same address -> scalar latency, always
    int current = A[start];
    int result = 0; // fix: was uninitialized — accumulating into it was undefined behavior
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        // touch local memory each iteration so the allocation can't be elided
        local_a[i & (LATENCY_LOCAL_MEM_SIZE - 1)] = current;
    }
    // NOTE(review): result feeds no output — only the chase through local_a
    // reaches ret[0]. Kept for parity with the other unrolled kernels.
    ret[0] = local_a[current & (LATENCY_LOCAL_MEM_SIZE - 1)];
}
================================================
FILE: GpuMemLatency/opencltest.c
================================================
#include "opencltest.h"
// default test sizes for latency, in KB. Deliberately includes non-power-of-two
// points (144, 172, 41200, ...) to probe around cache boundaries.
// NOTE(review): 768432 may be a typo for 786432 (= 768 * 1024) — confirm
int default_test_sizes[] = { 1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 144, 160, 172, 192, 256, 384, 512, 600, 768, 1024, 1536, 2048, 3072, 4096, 5120, 6144,
8192, 16384, 18432, 20480, 24576, 25600, 28672, 32768, 36864, 40960, 41200, 49152, 65536, 98304, 131072, 196608, 262144, 524288, 768432, 819200, 921600, 1048576 };
// lining this up with nemes's VK bw test sizes. units for this one are in bytes
const uint64_t default_bw_test_sizes[] = {
4096, 8192, 12288, 16384, 20480, 24576, 28672, 32768, 40960, 49152, 57344, 65536, 81920, 98304, 114688, 131072,
196608, 262144, 393216, 458752, 524288, 786432, 1048576, 1572864, 2097152, 3145728, 4194304, 6291456, 8388608, 12582912, 16777216, 20971520,
25165824, 33554432, 37748736, 41943040, 50331648, 58720256, 67108864, 100663296, 134217728, 201326592, 268435456, 402653184, 536870912, 805306368,
1073741824, 1610579968, 2147483648, 3221225472, 4294967296
};
// forward declarations for helpers used by main() below
float int_exec_latency_test(cl_context context,
cl_command_queue command_queue,
cl_kernel kernel,
uint32_t iterations);
uint32_t scale_bw_iterations(uint32_t base_iterations, uint32_t size_kb);
uint64_t scale_iterations(uint32_t size_kb, uint64_t iterations);
cl_ulong get_max_buffer_size();
cl_ulong get_max_constant_buffer_size();
// Which benchmark main() runs; selected by the "-test <name>" argument
// (the command-line name handled in main() is noted per value).
enum TestType {
VectorMemLatency,    // "vectorlatency": global memory latency, vector accesses (default)
ScalarMemLatency,    // "scalarlatency": global memory latency, scalar accesses
ConstantMemLatency,  // "constantlatency": constant memory latency
LocalMemCapacity,    // "localmemcapacity": GPU-wide local memory capacity
LocalMemLatency,     // "locallatency": local mem latency
TexMemLatency,       // "texlatency": texture mem latency
GlobalAtomicLatency, // "globalatomiccmpxchg": global atomic latency (cmpxchg)
LocalAtomicLatency,  // "localatomiccmpxchg": local atomic latency (cmpxchg)
GlobalAtomicAdd,     // "globalatomicadd": global atomic add throughput
LocalAtomicAdd,      // "localatomicadd": local atomic add throughput
GlobalMemBandwidth,  // "bw": global memory bandwidth
LocalMemBandwidth,   // "localbw": local memory bandwidth
LocalMemChaseBandwidth, // "localchasebw": local mem bw via pointer chasing, many waves
LocalMem64Bandwidth, // "local64bw": local mem bw with 64-bit loads
LocalMemFloat4Bandwidth, // "localfloat4bw": local mem bw with float4 loads
MixedFloat4Bandwidth, // "mixedbw": mixed local/global float4 load bw
LoadStoreBandwidth,  // "ldstbw": load/store bandwidth
TextureThroughput,   // "tmu": TMU throughput
BufferBandwidth,     // "bufferbw": buffer bandwidth
MemBandwidthWorkgroupScaling, // "scaling": bw scaling with workgroup count
CoreToCore,          // "c2c": latency with global atomics across CUs
LinkBandwidth,       // "link": host <-> GPU link bandwidth
InstructionRate,     // "instructionrate": instruction rate
Divergence,          // "divergence": throughput vs. consecutive threads doing the same op
Partition,           // "partition": execution unit partitioning (wave size must be set)
MemDivergence        // "memdivergence": memory access divergence cost
};
int main(int argc, char* argv[]) {
cl_int ret;
uint32_t stride = 64;
uint32_t list_size = 3840 * 2160 * 4;
uint32_t chase_iterations = 1e6 * 7;
// skip = 0 means auto
uint32_t thread_count = 1, local_size = 1, skip = 0, wave = 0;
float result;
int platform_index = -1, device_index = -1;
enum TestType testType = VectorMemLatency;
char thread_count_set = 0, local_size_set = 0, chase_iterations_set = 0, skip_set = 0;
int sizeKb = 0;
int forceCuCount = 0;
int forcefp16 = 0, forcefp64 = 0;
// vars for local mem capacity testing
int local_mem_size_kb = 0; // local mem allocated for each wg
int group_count = 0; // max wg count
for (int argIdx = 1; argIdx < argc; argIdx++) {
if (*(argv[argIdx]) == '-') {
char* arg = argv[argIdx] + 1;
if (_strnicmp(arg, "stride", 6) == 0) {
argIdx++;
stride = atoi(argv[argIdx]);
fprintf(stderr, "Using stride = %u\n", stride);
}
else if (_strnicmp(arg, "iterations", 10) == 0) {
argIdx++;
chase_iterations = atoi(argv[argIdx]);
chase_iterations_set = 1;
fprintf(stderr, "Using %u iterations\n", chase_iterations);
}
else if (_strnicmp(arg, "threads", 7) == 0) {
argIdx++;
thread_count = atoi(argv[argIdx]);
thread_count_set = 1;
fprintf(stderr, "Using %u threads\n", thread_count);
}
else if (_strnicmp(arg, "localsize", 9) == 0) {
argIdx++;
local_size = atoi(argv[argIdx]);
local_size_set = 1;
fprintf(stderr, "Using local size = %u\n", local_size);
}
else if (_strnicmp(arg, "wave", 4) == 0) {
argIdx++;
wave = atoi(argv[argIdx]);
fprintf(stderr, "Estimated wave size = %u\n", wave);
}
else if (_strnicmp(arg, "platform", 8) == 0) {
argIdx++;
platform_index = atoi(argv[argIdx]);
fprintf(stderr, "Using OpenCL platform index %d\n", platform_index);
}
else if (_strnicmp(arg, "device", 6) == 0) {
argIdx++;
device_index = atoi(argv[argIdx]);
fprintf(stderr, "Using OpenCL device index %d\n", device_index);
}
else if (_strnicmp(arg, "bwskip", 6) == 0) {
argIdx++;
skip = atoi(argv[argIdx]);
fprintf(stderr, "Workgroups will be spaced %u apart\n", skip);
}
else if (_strnicmp(arg, "sizekb", 6) == 0) {
argIdx++;
sizeKb = atoi(argv[argIdx]);
fprintf(stderr, "Only testing %d KB\n", sizeKb);
}
else if (_strnicmp(arg, "localmemsize", 12) == 0)
{
argIdx++;
local_mem_size_kb = atoi(argv[argIdx]);
fprintf(stderr, "Testing with %d of local memory allocated per WG\n", local_mem_size_kb);
}
else if (_strnicmp(arg, "groupcount", 10) == 0)
{
argIdx++;
group_count = atoi(argv[argIdx]);
fprintf(stderr, "Testing with up to %d WGs\n", group_count);
}
else if (_strnicmp(arg, "saveprogram", 11) == 0) {
saveprogram = 1;
fprintf(stderr, "Writing compiled program to disk\n");
}
else if (_strnicmp(arg, "forcefp16", 10) == 0) {
forcefp16 = 1;
fprintf(stderr, "For instruction rate testing, will run FP16 tests regardless of whether support is advertised\n");
}
else if (_strnicmp(arg, "forcefp64", 10) == 0) {
forcefp64 = 1;
fprintf(stderr, "For instruction rate testing, will run FP64 tests regardless of whether support is advertised\n");
}
else if (_strnicmp(arg, "test", 4) == 0) {
argIdx++;
if (_strnicmp(argv[argIdx], "vectorlatency", 13) == 0) {
testType = VectorMemLatency;
fprintf(stderr, "Testing global memory latency, vector accesses\n");
}
else if (_strnicmp(argv[argIdx], "scalarlatency", 13) == 0) {
testType = ScalarMemLatency;
fprintf(stderr, "Testing global memory latency, scalar accesses\n");
}
else if (_strnicmp(argv[argIdx], "constantlatency", 15) == 0) {
testType = ConstantMemLatency;
fprintf(stderr, "Testing constant memory latency\n");
}
else if (_strnicmp(argv[argIdx], "memdivergence", 13) == 0) {
testType = MemDivergence;
fprintf(stderr, "Testing memory access divergence cost\n");
}
else if (_strnicmp(argv[argIdx], "localmemcapacity", 16) == 0) {
testType = LocalMemCapacity;
fprintf(stderr, "Testing GPU-wide local memory capacity. Make sure localmemsize/groupcount are set appropriately!\n");
if (sizeKb == 0) sizeKb = 1;
if (group_count == 0) group_count = 16;
}
else if (_strnicmp(argv[argIdx], "globalatomiccmpxchg", 19) == 0) {
testType = GlobalAtomicLatency;
fprintf(stderr, "Testing global atomic latency (cmpxchg)\n");
}
else if (_strnicmp(argv[argIdx], "globalatomicadd", 15) == 0)
{
testType = GlobalAtomicAdd;
fprintf(stderr, "Testing global atomic add\n");
}
else if (_strnicmp(argv[argIdx], "locallatency", 13) == 0) {
testType = LocalMemLatency;
fprintf(stderr, "Testing local mem latency\n");
}
else if (_strnicmp(argv[argIdx], "texlatency", 10) == 0) {
testType = TexMemLatency;
fprintf(stderr, "Testing texture mem latency\n");
}
else if (_strnicmp(argv[argIdx], "localatomiccmpxchg", 18) == 0) {
testType = LocalAtomicLatency;
fprintf(stderr, "Testing local atomic latency (cmpxchg)\n");
}
else if (_strnicmp(argv[argIdx], "localatomicadd", 14) == 0) {
testType = LocalAtomicAdd;
fprintf(stderr, "Testing local atomic add\n");
}
else if (_strnicmp(argv[argIdx], "bw", 2) == 0) {
testType = GlobalMemBandwidth;
fprintf(stderr, "Testing global memory bandwidth\n");
// Somewhat reasonable defaults
if (!thread_count_set) thread_count = 131072;
if (!local_size_set) local_size = 256;
if (!chase_iterations_set) chase_iterations = 500000;
}
else if (_strnicmp(argv[argIdx], "localbw", 7) == 0) {
testType = LocalMemBandwidth;
if (!thread_count_set) thread_count = 262144;
if (!local_size_set) local_size = 256;
fprintf(stderr, "Testing local memory bandwidth\n");
}
else if (_strnicmp(argv[argIdx], "localchasebw", 12) == 0) {
testType = LocalMemChaseBandwidth;
fprintf(stderr, "Testing local memory bandwidth using pointer chasing and lots of waves\n");
}
else if (_strnicmp(argv[argIdx], "local64bw", 9) == 0) {
testType = LocalMem64Bandwidth;
fprintf(stderr, "Testing local memory bandwidth using 64-bit loads\n");
}
else if (_strnicmp(argv[argIdx], "localfloat4bw", 13) == 0) {
testType = LocalMemFloat4Bandwidth;
fprintf(stderr, "Testing local memory bandwidth using float4 (4x32-bit) loads\n");
}
else if (_strnicmp(argv[argIdx], "mixedbw", 7) == 0) {
testType = MixedFloat4Bandwidth;
fprintf(stderr, "Mixed local/global load bw test with float4\n");
}
else if (_strnicmp(argv[argIdx], "bufferbw", 8) == 0) {
testType = BufferBandwidth;
fprintf(stderr, "Testing buffer bandwidth\n");
}
else if (_strnicmp(argv[argIdx], "ldstbw", 6) == 0) {
testType = LoadStoreBandwidth;
fprintf(stderr, "Testing load/store bandwidth\n");
}
else if (_strnicmp(argv[argIdx], "scaling", 7) == 0)
{
testType = MemBandwidthWorkgroupScaling;
fprintf(stderr, "Testing BW scaling with workgroups\n");
if (!chase_iterations_set) chase_iterations = 20000000;
if (argIdx + 1 < argc && argv[argIdx + 1][0] != '-')
{
argIdx++;
forceCuCount = atoi(argv[argIdx]);
fprintf(stderr, "Using up to %d workgroups\n", forceCuCount);
}
}
else if (_strnicmp(argv[argIdx], "c2c", 3) == 0)
{
testType = CoreToCore;
fprintf(stderr, "Testing latency with global atomics across CU count\n");
}
else if (_strnicmp(argv[argIdx], "link", 4) == 0)
{
testType = LinkBandwidth;
fprintf(stderr, "Testing host <-> GPU link bandwidth\n");
if (!chase_iterations_set) chase_iterations = 30000000;
}
else if (_strnicmp(argv[argIdx], "instructionrate", 15) == 0)
{
testType = InstructionRate;
fprintf(stderr, "Testing instruction rate\n");
if (!chase_iterations_set) chase_iterations = 1000;
if (!local_size_set && !thread_count_set) {
local_size = 256;
thread_count = 32768;
fprintf(stderr, "Selecting local size = %d, threads = %d\n", local_size, thread_count);
}
}
else if (_strnicmp(argv[argIdx], "tmu", 3) == 0)
{
testType = TextureThroughput;
fprintf(stderr, "Testing TMUs\n");
}
else if (_strnicmp(argv[argIdx], "divergence", 10) == 0)
{
testType = Divergence;
fprintf(stderr, "Testing compute throughput with varying numbers of consecutive threads doing the same op\n");
if (!local_size_set && !thread_count_set) {
local_size = 256;
thread_count = 32768;
fprintf(stderr, "Selecting local size = %d, threads = %d\n", local_size, thread_count);
}
}
else if (_strnicmp(argv[argIdx], "partition", 9) == 0)
{
testType = Partition;
fprintf(stderr, "Testing execution unit partitioning. Make sure wave size is set!\n");
}
else {
fprintf(stderr, "I'm so confused. Unknown test type %s\n", argv[argIdx]);
}
}
}
}
if (argc == 1)
{
fprintf(stderr, "Usage:\n\t[-test ]\n\t[-platform ]\n\t[-device ]\n");
fprintf(stderr, "\t[-threads ]\n\t[-localsize ]\n\t[-bwskip ]\n");
fprintf(stderr, "Number of threads (OpenCL global work size) must be divisible by local work size\n");
}
fprintf(stderr, "Using %d threads with local size %d\n", thread_count, local_size);
#pragma region opencl_overhead
// Create an OpenCL context
cl_context context = get_context_from_user(platform_index, device_index);
if (context == NULL) exit(1);
// Load kernel
cl_program program = build_program(context, "kernel.cl", NULL);
if (saveprogram) write_program(program, "kernel");
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, selected_device_id, 0, &ret);
fprintf(stderr, "clCreateCommandQueue returned %d\n", ret);
cl_kernel c2c_atomic_latency_test_kernel = clCreateKernel(program, "c2c_atomic_exec_latency_test", &ret);
cl_kernel dummy_add_kernel = clCreateKernel(program, "dummy_add", &ret);
cl_kernel local_bw_chase_kernel = clCreateKernel(program, "local_chase_kernel", &ret);
#pragma endregion opencl_overhead
max_global_test_size = get_max_buffer_size();
if (testType == GlobalAtomicLatency)
{
cl_program prog = build_program(context, "atomic_exec_latency_test.cl", NULL);
cl_kernel atomic_latency_test_kernel = clCreateKernel(prog, "atomic_exec_latency_test", &ret);
if (saveprogram) write_program(prog, "atomic_exec_latency_test");
chase_iterations = 200000;
uint32_t elapsed_ms = 0, target_ms = 2000;
while (elapsed_ms < target_ms / 2) {
result = int_atomic_latency_test(context, command_queue, atomic_latency_test_kernel, chase_iterations, false, &elapsed_ms);
fprintf(stderr, "%d iterations, %u ms => %f ns\n", chase_iterations, elapsed_ms, result);
chase_iterations = scale_iterations_to_target(chase_iterations, elapsed_ms, target_ms);
}
printf("global atomic latency: %f\n", result);
clReleaseKernel(atomic_latency_test_kernel);
clReleaseProgram(prog);
}
else if (testType == LocalAtomicLatency)
{
cl_program prog = build_program(context, "local_atomic_latency_test.cl", NULL);
cl_kernel local_atomic_latency_test_kernel = clCreateKernel(prog, "local_atomic_latency_test", &ret);
if (saveprogram) write_program(prog, "local_atomic_latency_test");
chase_iterations = 500000;
uint32_t elapsed_ms = 0, target_ms = 2000;
while (elapsed_ms < target_ms / 2) {
result = int_atomic_latency_test(context, command_queue, local_atomic_latency_test_kernel, chase_iterations, true, &elapsed_ms);
fprintf(stderr, "%d iterations, %u ms => %f ns\n", chase_iterations, elapsed_ms, result);
chase_iterations = scale_iterations_to_target(chase_iterations, (float)elapsed_ms, (float)target_ms);
}
printf("local atomic latency: %f\n", result);
clReleaseKernel(local_atomic_latency_test_kernel);
clReleaseProgram(prog);
}
else if (testType == GlobalAtomicAdd)
{
cl_program prog = build_program(context, "atomic_exec_latency_test.cl", NULL);
cl_kernel global_atomic_add_kernel = clCreateKernel(prog, "atomic_add_test", &ret);
if (saveprogram) write_program(prog, "atomic_exec_latency_test");
result = int_atomic_add_test(context, command_queue, global_atomic_add_kernel, thread_count, local_size);
fprintf(stderr, "Global atomic INT32 adds: %f GOPS\n", result);
}
else if (testType == LocalAtomicAdd)
{
cl_program prog = build_program(context, "local_atomic_latency_test.cl", NULL);
cl_kernel local_atomic_add_kernel = clCreateKernel(prog, "local_atomic_add_test", &ret);
if (saveprogram) write_program(prog, "local_atomic_latency_test");
result = int_atomic_add_test(context, command_queue, local_atomic_add_kernel, thread_count, local_size);
fprintf(stderr, "Local atomic INT32 adds: %f GOPS\n", result);
}
else if (testType == VectorMemLatency || testType == ScalarMemLatency)
{
cl_program prog;
cl_kernel globalMemLatencyKernel;
if (testType == ScalarMemLatency)
{
prog = build_program(context, "scalar_unrolled_latency_test.cl", NULL);
globalMemLatencyKernel = clCreateKernel(prog, "scalar_unrolled_latency_test", &ret);
if (saveprogram) write_program(prog, "scalar_unrolled_latency_test");
}
else // Vector mem latency
{
prog = build_program(context, "unrolled_latency_test.cl", NULL);
globalMemLatencyKernel = clCreateKernel(prog, "unrolled_latency_test", &ret);
if (saveprogram) write_program(prog, "unrolled_latency_test");
}
fprintf(stderr, "Doing %d K p-chase iterations with stride %d over %d KiB region\n", chase_iterations / 1000, stride, list_size * 4 / 1024);
printf("\nSattolo, global memory latency (up to %llu K) unroll:\n", max_global_test_size / 1024);
for (int size_idx = 0; size_idx < sizeof(default_test_sizes) / sizeof(int); size_idx++) {
if (max_global_test_size < sizeof(int) * 256 * default_test_sizes[size_idx]) {
printf("%d K would exceed device's max buffer size of %llu K, stopping here.\n", default_test_sizes[size_idx], max_global_test_size / 1024);
break;
}
result = latency_test(context, command_queue,
globalMemLatencyKernel, 256 * default_test_sizes[size_idx], scale_iterations(default_test_sizes[size_idx], chase_iterations), false, thread_count, local_size, wave, stride, NULL);
printf("%d,%f\n", default_test_sizes[size_idx], result);
if (result == 0) {
printf("Something went wrong, not testing anything bigger.\n");
break;
}
}
clReleaseKernel(globalMemLatencyKernel);
clReleaseProgram(prog);
}
else if (testType == MemDivergence) {
cl_program vecProg, texProg;
cl_kernel vecKernel, texKernel;
fprintf(stderr, "Testing mem divergence with localsize %d, test size %d KB\n", local_size, sizeKb);
// vector
vecProg = build_program(context, "unrolled_latency_test.cl", NULL);
if (saveprogram) write_program(vecProg, "vector_unrolled_latency_test");
vecKernel = clCreateKernel(vecProg, "unrolled_latency_test", &ret);
texProg = build_program(context, "tex_latency_test.cl", NULL);
texKernel = clCreateKernel(texProg, "tex_latency_test", &ret);
if (saveprogram) write_program(texProg, "tex_latency_test");
float* memDivergenceResults = (float*)malloc(sizeof(float) * local_size * 2);
for (int threadCount = 1; threadCount <= local_size; threadCount++) {
float vecResult = latency_test(context, command_queue, vecKernel, 256 * sizeKb, scale_iterations(sizeKb, chase_iterations), false, threadCount, threadCount, 1, stride, NULL);
memDivergenceResults[threadCount * 2] = vecResult;
float texResult = tex_latency_test(context, command_queue, texKernel, 256 * sizeKb, scale_iterations(sizeKb, chase_iterations), threadCount, threadCount, 1);
memDivergenceResults[threadCount * 2 + 1] = texResult;
fprintf(stderr, "%d threads: %f vec, %f tex\n", threadCount, vecResult, texResult);
}
for (int threadCount = 1; threadCount <= local_size; threadCount++) {
printf("%d,%f,%f\n", threadCount, memDivergenceResults[threadCount * 2], memDivergenceResults[threadCount * 2 + 1]);
}
clReleaseKernel(texKernel);
clReleaseKernel(vecKernel);
clReleaseProgram(texProg);
clReleaseProgram(vecProg);
free(memDivergenceResults);
}
else if (testType == LocalMemCapacity)
{
char build_options[128];
const char* local_mem_define_prefix = "-D LATENCY_LOCAL_MEM_SIZE=";
memset(build_options, 0, 128);
memcpy(build_options, local_mem_define_prefix, 26);
snprintf(build_options + 26, 128 - 26, "%u", 256 * local_mem_size_kb);
cl_program program = build_program(context, "local_mem_latency_kernel.cl", build_options);
cl_kernel local_mem_capacity_kernel = clCreateKernel(program, "unrolled_latency_test_localmem", &ret);
if (ret != CL_SUCCESS)
{
fprintf(stderr, "Could not create local mem capacity testing kernel\n");
exit(0);
}
if (saveprogram) write_program(program, "local_mem_latency_kernel");
fprintf(stderr, "Testing local memory capacity with %u KB of local mem per WG, up to %u WGs\n", local_mem_size_kb, group_count);
printf("Groups,Local Mem Capacity,Latency\n");
for (int groups = 1; groups <= group_count; groups++) {
result = latency_test(context, command_queue,
local_mem_capacity_kernel,
256 * sizeKb,
(uint32_t)scale_iterations(sizeKb, chase_iterations),
true,
groups,
1,
1,
64,
NULL);
printf("%d,%d,%f\n", groups, groups* local_mem_size_kb, result);
}
clReleaseKernel(local_mem_capacity_kernel);
clReleaseProgram(program);
}
else if (testType == ConstantMemLatency)
{
cl_program prog = build_program(context, "constant_unrolled_latency_test.cl", NULL);
cl_kernel constant_kernel = clCreateKernel(prog, "constant_unrolled_latency_test", &ret);
if (saveprogram) write_program(prog, "constant_unrolled_latency_test");
cl_ulong max_constant_test_size = get_max_constant_buffer_size();
printf("\nSattolo, constant memory (up to %llu K), no-unroll:\n", max_constant_test_size / 1024);
for (int size_idx = 0; size_idx < sizeof(default_test_sizes) / sizeof(int); size_idx++) {
if (max_constant_test_size < sizeof(int) * 256 * default_test_sizes[size_idx]) {
printf("%d K would exceed device's max constant buffer size of %llu K, stopping here.\n", default_test_sizes[size_idx], max_constant_test_size / 1024);
break;
}
result = latency_test(context, command_queue, constant_kernel, 256 * default_test_sizes[size_idx], scale_iterations(default_test_sizes[size_idx], chase_iterations), false, thread_count, local_size, wave, stride, NULL);
printf("%d,%f\n", default_test_sizes[size_idx], result);
if (result == 0) {
printf("Something went wrong, not testing anything bigger.\n");
break;
}
}
clReleaseKernel(constant_kernel);
clReleaseProgram(program);
}
else if (testType == TexMemLatency)
{
cl_program prog = build_program(context, "tex_latency_test.cl", NULL);
cl_kernel tex_latency_kernel = clCreateKernel(prog, "tex_latency_test", &ret);
if (saveprogram) write_program(prog, "tex_latency_test");
cl_ulong max_tex_test_size = get_max_tex_buffer_size();
for (int size_idx = 0; size_idx < sizeof(default_test_sizes) / sizeof(int); size_idx++) {
if (default_test_sizes[size_idx] * 1024 > max_tex_test_size) {
printf("%d K would exceed device's texture buffer size of %llu K, stopping here.\n", default_test_sizes[size_idx], max_tex_test_size / 1024);
break;
}
result = tex_latency_test(context, command_queue, tex_latency_kernel, 256 * default_test_sizes[size_idx], scale_iterations(default_test_sizes[size_idx], chase_iterations),
thread_count, local_size, wave);
printf("%d,%f\n", default_test_sizes[size_idx], result);
if (result == 0) {
printf("Something went wrong, not testing anything bigger.\n");
break;
}
}
clReleaseKernel(tex_latency_kernel);
clReleaseProgram(prog);
}
else if (testType == LocalMemLatency)
{
cl_program prog = build_program(context, "local_unrolled_latency_test.cl", NULL);
cl_kernel local_kernel = clCreateKernel(prog, "local_unrolled_latency_test", &ret);
if (saveprogram) write_program(prog, "local_unrolled_latency_test");
uint32_t elapsed_ms = 0, target_ms = 2000;
chase_iterations = 50000;
while (elapsed_ms < target_ms / 2) {
result = latency_test(context, command_queue, local_kernel, 1024, chase_iterations, false, thread_count, local_size, wave, stride, &elapsed_ms);
fprintf(stderr, "%u iterations, %u ms -> %f ns\n", chase_iterations, elapsed_ms, result);
chase_iterations = scale_iterations_to_target(chase_iterations, elapsed_ms, target_ms);
}
printf("Local mem latency: %f\n", result);
clReleaseKernel(local_kernel);
clReleaseProgram(prog);
}
else if (testType == GlobalMemBandwidth)
{
cl_program prog = build_program(context, "sum_bw_test.cl", NULL);
cl_kernel bw_kernel = clCreateKernel(prog, "sum_bw_test", &ret);
if (saveprogram) write_program(prog, "sum_bw_test");
fprintf(stderr, "Using %u threads, %u local size, %u base iterations\n", thread_count, local_size, chase_iterations);
printf("\nMemory bandwidth (up to %llu K):\n", max_global_test_size / 1024);
if (!sizeKb) {
for (int size_idx = 0; size_idx < sizeof(default_bw_test_sizes) / sizeof(unsigned long long); size_idx++) {
uint64_t testSizeKb = default_bw_test_sizes[size_idx] / 1024;
if ((max_global_test_size / 1024) < testSizeKb) {
printf("%llu K would exceed device's max buffer size of %llu K, stopping here.\n", testSizeKb, max_global_test_size / 1024);
break;
}
result = bw_test(context,
command_queue,
bw_kernel, 256 * testSizeKb,
thread_count,
local_size,
skip,
scale_bw_iterations(chase_iterations, testSizeKb));
printf("%llu,%f\n", testSizeKb, result);
if (result == 0) {
printf("Something went wrong, not testing anything bigger.\n");
break;
}
}
}
else {
result = bw_test(context,
command_queue,
bw_kernel, 256 * sizeKb,
thread_count,
local_size,
skip,
scale_bw_iterations(chase_iterations, sizeKb));
printf("%lu,%f\n", sizeKb, result);
if (result == 0) {
printf("Something went wrong, not testing anything bigger.\n");
}
}
clReleaseKernel(bw_kernel);
clReleaseProgram(prog);
}
else if (testType == LocalMemBandwidth ||
testType == LocalMem64Bandwidth ||
testType == BufferBandwidth ||
testType == LoadStoreBandwidth ||
testType == TextureThroughput ||
testType == LocalMemFloat4Bandwidth ||
testType == MixedFloat4Bandwidth)
{
cl_program prog;
cl_kernel local_bw_kernel = NULL, local_64_bw_kernel = NULL, local_float4_bw_kernel = NULL, buffer_bw_kernel = NULL, tex_bw_kernel = NULL, loadstore_bw_kernel = NULL;
cl_kernel mixed_bw_kernel = NULL;
if (testType == LocalMemBandwidth)
{
prog = build_program(context, "local_bw_test.cl", NULL);
local_bw_kernel = clCreateKernel(prog, "local_bw_test", &ret);
if (saveprogram) write_program(prog, "local_bw_test");
}
else if (testType == LocalMem64Bandwidth) {
prog = build_program(context, "local_64_bw_test.cl", NULL);
local_64_bw_kernel = clCreateKernel(prog, "local_64_bw_test", &ret);
if (saveprogram) write_program(prog, "local_64_bw_test");
}
else if (testType == LocalMemFloat4Bandwidth) {
prog = build_program(context, "local_float4_bw_test.cl", NULL);
local_float4_bw_kernel = clCreateKernel(prog, "local_float4_bw_test", &ret);
if (saveprogram) write_program(prog, "local_float4_bw_test");
}
else if (testType == BufferBandwidth) {
prog = build_program(context, "buffer_bw_test.cl", NULL);
buffer_bw_kernel = clCreateKernel(prog, "buffer_bw_test", &ret);
if (saveprogram) write_program(prog, "buffer_bw_test");
}
else if (testType == LoadStoreBandwidth)
{
prog = build_program(context, "ldst_bw_test.cl", NULL);
loadstore_bw_kernel = clCreateKernel(prog, "ldst_bw_test", &ret);
if (saveprogram) write_program(prog, "ldst_bw_test");
}
else if (testType == MixedFloat4Bandwidth)
{
prog = build_program(context, "local_float4_bw_test.cl", NULL);
mixed_bw_kernel = clCreateKernel(prog, "mixed_float4_bw_test", NULL);
if (saveprogram) write_program(prog, "mixed_float4_bw_test");
}
else { // tex throughput
prog = build_program(context, "tex_bw_test.cl", NULL);
tex_bw_kernel = clCreateKernel(prog, "tex_bw_test", &ret);
if (saveprogram) write_program(prog, "tex_bw_test");
}
uint32_t thread_low = 1024, thread_high = 1048576*4;
if (!thread_count_set) thread_count = thread_low;
float max_bw = 0;
while (true) {
int64_t elapsed_ms = 0, target_ms = 1500;
if (!chase_iterations_set) chase_iterations = 500000;
while (elapsed_ms < target_ms / 2)
{
if (testType == LocalMemBandwidth) {
fprintf(stderr, "Testing local mem bw\n");
result = local_bw_test(context, command_queue, local_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);
}
else if (testType == LocalMem64Bandwidth) {
fprintf(stderr, "Testing local mem bw with 64-bit loads\n");
result = local_64_bw_test(context, command_queue, local_64_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);
}
else if (testType == LocalMemFloat4Bandwidth) {
fprintf(stderr, "Testing local mem bw with float4 loads\n");
result = local_bw_test(context, command_queue, local_float4_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);
}
else if (testType == MixedFloat4Bandwidth) {
fprintf(stderr, "Testing mixed local/global bw with float4 loads\n");
result = local_bw_test(context, command_queue, mixed_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);
}
else if (testType == BufferBandwidth)
{
fprintf(stderr, "Testing buffer bw\n");
result = buffer_bw_test(context, command_queue, buffer_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);
}
else if (testType == LoadStoreBandwidth)
{
fprintf(stderr, "Testing global load bandwidth\n");
result = local_bw_test(context, command_queue, loadstore_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms);
}
else if (testType == TextureThroughput)
{
fprintf(stderr, "Testing texture throughput\n");
result = tex_bw_test(context,
command_queue,
tex_bw_kernel,
256, // width
256, // height
thread_count,
local_size,
0,
chase_iterations,
&elapsed_ms);
}
fprintf(stderr, "%u threads, %u local size, %u iterations ==> %f GB/s, elapsed time %lld ms\n",
thread_count, local_size, chase_iterations, result, elapsed_ms);
if (elapsed_ms < 25) chase_iterations *= 2;
else chase_iterations = (uint32_t)((float)chase_iterations * (target_ms / elapsed_ms));
if (result == 0)
{
fprintf(stderr, "Run failed\n");
break;
}
if (chase_iterations_set) break;
}
if (result > max_bw) max_bw = result;
if (thread_count_set) break;
thread_count *= 2;
if (thread_count > thread_high) break;
}
printf("Bandwidth: %f GB/s\n", max_bw);
}
else if (testType == LocalMemChaseBandwidth)
{
int thread_scan_done = 0;
uint32_t thread_low = 256, thread_high = 524288 * 4;
fprintf(stderr, "Testing local memory bandwidth using pointer chasing. Ensure wave size is set correctly with -wave\n");
if (!thread_count_set) thread_count = thread_low;
while (!thread_scan_done) {
// ignore chase iterations and auto manage it
int64_t elapsed_ms = 0, target_ms = 1500;
chase_iterations = 500000;
if (thread_count_set) thread_scan_done = 0;
else
{
thread_count *= 2;
if (thread_count > thread_high) break;
}
while (elapsed_ms < target_ms / 2)
{
result = local_chase_bw_test(context, command_queue, local_bw_chase_kernel, thread_count, local_size, chase_iterations, wave, &elapsed_ms);
fprintf(stderr, "%u threads, %u local size, %u wave, %u iterations ==> %f GB/s, elapsed time %lld ms\n",
thread_count, local_size, wave, chase_iterations, result, elapsed_ms);
if (elapsed_ms < 25) chase_iterations *= 2;
else chase_iterations = (uint32_t)((float)chase_iterations * (target_ms / elapsed_ms));
if (result == 0)
{
fprintf(stderr, "Run failed\n");
break;
}
}
}
printf("Local memory bandwidth: %f GB/s\n", result);
}
else if (testType == MemBandwidthWorkgroupScaling)
{
cl_program prog = build_program(context, "sum_bw_test.cl", NULL);
cl_kernel bw_kernel = clCreateKernel(prog, "sum_bw_test", &ret);
if (saveprogram) write_program(prog, "sum_bw_test");
uint32_t testSizeCount = sizeof(default_bw_test_sizes) / sizeof(unsigned long long);
cl_uint cuCount = forceCuCount ? forceCuCount : getCuCount();
fprintf(stderr, "Device has %u compute units\n", cuCount);
float* scalingResults = (float*)malloc(sizeof(float) * cuCount * testSizeCount);
for (uint32_t workgroupCount = 1; workgroupCount <= cuCount; workgroupCount++)
{
if (!sizeKb) {
for (int size_idx = 0; size_idx < testSizeCount; size_idx++)
{
uint64_t testSizeKb = default_bw_test_sizes[size_idx] / 1024;
fprintf(stderr, "Testing size %llu KB, %u workgroups\n", testSizeKb, workgroupCount);
if ((max_global_test_size / 1024) < testSizeKb) {
printf("%llu K would exceed device's max buffer size of %llu K\n", testSizeKb, max_global_test_size / 1024);
scalingResults[(workgroupCount - 1) * testSizeCount + size_idx] = 0;
continue;
}
result = bw_test(context,
command_queue,
bw_kernel, 256 * testSizeKb,
local_size * workgroupCount,
local_size,
skip,
scale_bw_iterations(chase_iterations, testSizeKb));
scalingResults[(workgroupCount - 1) * testSizeCount + size_idx] = result;
fprintf(stderr, "%u workgroups, %llu KB = %f GB/s\n", workgroupCount, testSizeKb, result);
}
}
else {
fprintf(stderr, "Testing size %d KB, %u workgroups\n", sizeKb, workgroupCount);
result = bw_test(context,
command_queue,
bw_kernel, 256 * sizeKb,
local_size * workgroupCount,
local_size,
skip,
scale_bw_iterations(chase_iterations, sizeKb));
scalingResults[workgroupCount - 1] = result;
fprintf(stderr, "%u workgroups, %lu KB = %f GB/s\n", workgroupCount, sizeKb, result);
}
}
if (!sizeKb) {
for (uint32_t workgroupCount = 1; workgroupCount <= cuCount; workgroupCount++)
{
printf(",%u", workgroupCount);
}
printf("\n");
for (int size_idx = 0; size_idx < testSizeCount; size_idx++)
{
printf("%llu", default_bw_test_sizes[size_idx] / 1024);
for (uint32_t workgroupCount = 1; workgroupCount <= cuCount; workgroupCount++)
{
printf(",%f", scalingResults[(workgroupCount - 1) * testSizeCount + size_idx]);
}
printf("\n");
}
}
else {
printf("For %d KB:\n", sizeKb);
for (int workgroupIdx = 0; workgroupIdx < cuCount; workgroupIdx++)
{
printf("%d,%f\n", workgroupIdx + 1, scalingResults[workgroupIdx]);
}
printf("\n");
}
free(scalingResults);
clReleaseKernel(bw_kernel);
clReleaseProgram(prog);
}
else if (testType == CoreToCore)
{
c2c_atomic_latency_test(context, command_queue, c2c_atomic_latency_test_kernel, chase_iterations);
}
else if (testType == LinkBandwidth)
{
link_bw_test(context, command_queue, dummy_add_kernel, chase_iterations);
}
else if (testType == InstructionRate)
{
instruction_rate_test(context, command_queue, thread_count, local_size, chase_iterations, forcefp16, forcefp64);
}
else if (testType == Divergence)
{
int current_wave = 1;
int max_wave = 512;
printf("Contiguous Thread Block Size,FP32 GOPs\n");
while (current_wave <= max_wave)
{
float gops = run_divergence_rate_test(context, command_queue, thread_count, local_size, current_wave, NULL);
printf("%d,%f\n", current_wave, gops);
current_wave *= 2;
}
}
else if (testType == Partition)
{
// function and its associated kernel serve two purposes
int pattern4[] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 };
float result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, pattern4);
printf("Throughput: %f\n", result);
int patterns[] = { 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0 };
result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, patterns);
printf("Throughput: %f\n", result);
int pattern2[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, pattern2);
printf("Throughput: %f\n", result);
int consec_pattern[] = { 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0 };
result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, consec_pattern);
printf("Throughput: %f\n", result);
}
//printf("If you didn't run this through cmd, now you can copy the results. And press ctrl+c to close");
//scanf("\n");
// Clean up
cleanup:
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseProgram(program);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
return 0;
}
///
/// Heuristic to make sure test runs for enough time but not too long
///
/// Region size
/// base iterations
/// scaled iterations
uint64_t scale_iterations(uint32_t size_kb, uint64_t iterations) {
return 10 * iterations / pow(size_kb, 1.0 / 4.0);
}
#define INT_EXEC_INPUT_SIZE 16

/// Measures integer execution latency by timing a kernel with a single work-item
/// (global and local size both 1), so a dependent chain in the kernel exposes
/// per-operation latency rather than throughput.
///
/// context       - OpenCL context the kernel was created under
/// command_queue - queue to submit the kernel on
/// kernel        - latency kernel taking (input array, iteration count, result buffer)
/// iterations    - loop iteration count passed to the kernel
/// returns latency in ns per operation, or 0 on failure
float int_exec_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations)
{
    cl_int ret;
    cl_int result = 0;
    size_t global_item_size = 1;
    size_t local_item_size = 1;
    float latency;
    uint32_t time_diff_ms;
    uint32_t A[INT_EXEC_INPUT_SIZE];
    for (int i = 0; i < INT_EXEC_INPUT_SIZE; i++) A[i] = i;
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, INT_EXEC_INPUT_SIZE * sizeof(uint32_t), NULL, &ret);
    // Bug fix: the last argument of clCreateBuffer is the error-code out-parameter
    // and must be &ret. The original passed &result, clobbering the value that is
    // subsequently written into the result buffer with the create status code.
    cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &ret);
    if (a_mem_obj == NULL || result_obj == NULL)
    {
        fprintf(stderr, "Failed to create buffers for int exec latency test\n");
        latency = 0;
        goto cleanup;
    }
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, INT_EXEC_INPUT_SIZE * sizeof(uint32_t), A, 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(cl_int), &result, 0, NULL, NULL);
    clFinish(command_queue);
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj);
    clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj);
    start_timing();
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret);
        latency = 0;
        goto cleanup;
    }
    clFinish(command_queue);
    time_diff_ms = end_timing();
    // 1e6 converts ms -> ns; the *12 divisor assumes 12 dependent ops per kernel
    // loop iteration — TODO confirm against the kernel source
    latency = 1e6 * (float)time_diff_ms / (float)(iterations * 12);
cleanup:
    clFlush(command_queue);
    clFinish(command_queue);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(result_obj);
    return latency;
}
// Halves the iteration count once the bandwidth test region reaches 4 MB,
// keeping large-region runs from taking disproportionately long.
uint32_t scale_bw_iterations(uint32_t base_iterations, uint32_t size_kb)
{
    return (size_kb >= 4096) ? base_iterations / 2 : base_iterations;
}
================================================
FILE: GpuMemLatency/opencltest.h
================================================
#pragma once
#ifndef opencltestheader
#define opencltestheader
// NOTE(review): the include targets below appear to have been stripped by the
// text extraction (bare "#include" lines) — restore the original system headers
// (stdio/stdlib/string/stdint/math, per the functions used) before building.
#include
#include
#include
#include
#include
#include "../Common/timing.h"
// int-valued boolean flags for pre-C99 compatibility (no <stdbool.h> assumed)
#define false 0
#define true 1
// allow use of OpenCL 1.2 APIs that newer SDK headers mark deprecated
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#ifndef __APPLE__
#include
#else
#include
#endif
// maximum bytes read when loading an OpenCL kernel source file
#define MAX_SOURCE_SIZE (0x100000)
#define CACHELINE_SIZE 64
// default target runtime per test, in milliseconds
#define TARGET_TIME_MS 2000
// MSVC provides _strnicmp; map to strncmp elsewhere (note: strncmp is
// case-sensitive, so matching behavior differs slightly off-MSVC)
#ifndef _MSC_VER
#define _strnicmp strncmp
#endif

// globals set during device/platform selection (defined elsewhere)
extern cl_device_id selected_device_id;
extern cl_platform_id selected_platform_id;
extern cl_ulong max_global_test_size;
extern int saveprogram;

// context / program setup helpers
cl_context get_context_from_user(int platform_index, int device_index);
cl_program build_program(cl_context context, const char* fname, const char *params);
void write_program(cl_program program, const char *name);
uint32_t adjust_iterations(uint32_t iterations, uint64_t time_ms);
void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment);

// device capability queries
cl_uint getCuCount();
size_t getMaxWorkgroupSize();
cl_ulong get_max_constant_buffer_size();
cl_ulong get_max_buffer_size();
cl_ulong get_max_tex_buffer_size();
cl_ulong get_max_2d_tex_width();
cl_ulong get_max_2d_tex_height();

// atomic latency/throughput tests
float int_atomic_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations,
    short local,
    uint32_t *time_ms);
float int_atomic_add_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    size_t threads,
    size_t localsize);

// pointer-chasing memory latency tests; return latency in ns, 0 on failure
float latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t list_size,
    uint32_t chase_iterations,
    short uniform,
    int threads,
    int local_size,
    int wave,
    int stride,
    uint32_t *elapsed_ms);
float tex_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t list_size,
    uint32_t chase_iterations,
    int threads,
    int local_size,
    int wave_size);

// bandwidth tests; return GB/s, 0 on failure
float bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint64_t list_size,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t skip,
    uint32_t chase_iterations);
float tex_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint64_t width,
    uint64_t height,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t randomize,
    uint32_t chase_iterations,
    int64_t *time_ms);
float local_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int64_t *time_ms);
float local_chase_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    uint32_t wave_size,
    int64_t* time_ms);
float local_64_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int64_t* time_ms);
float buffer_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int64_t* time_ms);

// cross-device / cross-core tests
void link_bw_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations);
float c2c_atomic_latency_test(cl_context context,
    cl_command_queue command_queue,
    cl_kernel kernel,
    uint32_t iterations);

// compute throughput tests
float instruction_rate_test(cl_context context,
    cl_command_queue command_queue,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int forcefp16,
    int forcefp64);
float run_divergence_rate_test(cl_context context,
    cl_command_queue command_queue,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t wave,
    int *pattern);
#endif
================================================
FILE: GpuMemLatency/opencltest.sln
================================================
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.30503.244
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "opencltest", "opencltest.vcxproj", "{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Debug|x86 = Debug|x86
Release|x64 = Release|x64
Release|x86 = Release|x86
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x64.ActiveCfg = Debug|x64
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x64.Build.0 = Debug|x64
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x86.ActiveCfg = Debug|Win32
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x86.Build.0 = Debug|Win32
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x64.ActiveCfg = Release|x64
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x64.Build.0 = Release|x64
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x86.ActiveCfg = Release|Win32
{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x86.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {4447E91D-E7A1-4249-87A7-E75A78167E71}
EndGlobalSection
EndGlobal
================================================
FILE: GpuMemLatency/opencltest.vcxproj
================================================
Debug
Win32
Release
Win32
Debug
x64
Release
x64
16.0
Win32Proj
{fa51d7f4-f6e0-4cb5-9cdd-ad39a3519f78}
opencltest
10.0
Application
true
v143
Unicode
Application
false
v143
true
Unicode
Application
true
v143
Unicode
Application
false
v143
true
Unicode
true
false
true
false
Level3
true
WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
Level3
true
true
true
WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
Console
true
true
true
Level3
true
_CRT_SECURE_NO_WARNINGS;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
$(SolutionDir)\OpenCL\include;%(AdditionalIncludeDirectories)
Console
true
$(SolutionDir)\OpenCL\lib;%(AdditionalLibraryDirectories)
OpenCL.lib;%(AdditionalDependencies)
Level3
true
true
true
_CRT_SECURE_NO_WARNINGS;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)
true
$(SolutionDir)\OpenCL\include;%(AdditionalIncludeDirectories)
Console
true
true
true
$(SolutionDir)\OpenCL\lib;%(AdditionalLibraryDirectories)
OpenCL.lib;%(AdditionalDependencies)
false
CppCode
false
Document
false
Document
false
Document
false
Document
Document
Document
Document
Document
Document
Document
Document
Document
Document
Document
Document
Document
Document
Document
================================================
FILE: GpuMemLatency/opencltest.vcxproj.filters
================================================
================================================
FILE: GpuMemLatency/texturetest.c
================================================
#include "opencltest.h"
================================================
FILE: InstructionRate/Makefile
================================================
# Build the instruction-rate microbenchmarks for each supported architecture.
# arch_detect.mk presumably sets TARGET to match the build host — verify there.
# NOTE(review): recipe lines below appear to have lost their leading tabs in
# extraction; GNU Make requires each recipe line to start with a tab.
include ../Common/arch_detect.mk
CFLAGS = -O3
# default: build whichever target arch_detect.mk selected
all: $(TARGET)
# x86-64 Linux build (C driver + hand-written assembly kernels)
amd64:
$(CC) $(CFLAGS) x86_instructionrate.s x86_instructionrate.c -o InstructionRate_amd64 $(LDFLAGS)
# AArch64 Linux build
aarch64:
$(CC) $(CFLAGS) -march=native -pthread arm_instructionrate.s arm_instructionrate.c -o InstructionRate_aarch64 $(LDFLAGS)
# RISC-V 64-bit build (rv64gc baseline ISA)
riscv64:
$(CC) $(CFLAGS) -march=rv64gc -pthread riscv_instructionrate.s riscv_instructionrate.c -o InstructionRate_riscv64 $(LDFLAGS)
# Android/Termux build: clang with AES extension enabled for the crypto tests
termux:
clang -march=armv8+aes arm_instructionrate.s arm_instructionrate.c -o InstructionRate_aarch64 $(LDFLAGS)
# x86-64 macro-op fusion test binary
amd64_fusion:
$(CC) $(CFLAGS) x86_fusion.s x86_fusion.c -o InstructionRateFusion_amd64 $(LDFLAGS)
# Windows x86-64 cross/native build
w64:
$(CC) $(CFLAGS) x86_instructionrate.c x86_instructionrate.s -o InstructionRate_w64.exe $(LDFLAGS)
# everything CI builds
ci: amd64 amd64_fusion aarch64 riscv64 w64
clean:
rm -f *.o && find . -type f -executable -delete
.PHONY: all ci clean
================================================
FILE: InstructionRate/arm_instructionrate.c
================================================
#define _GNU_SOURCE
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
extern uint64_t noptest(uint64_t iterations);
extern uint64_t clktest(uint64_t iterations);
extern uint64_t addtest(uint64_t iterations);
extern uint64_t eortest(uint64_t iterations);
extern uint64_t maddaddtest(uint64_t iterations);
extern uint64_t cmptest(uint64_t iterations);
extern uint64_t addmultest(uint64_t iterations);
extern uint64_t addmul21test(uint64_t iterations);
extern uint64_t mul32test(uint64_t iterations);
extern uint64_t mul64test(uint64_t iterations);
extern uint64_t latmul64test(uint64_t iterations);
extern uint64_t jmptest(uint64_t iterations);
extern uint64_t fusejmptest(uint64_t iterations);
extern uint64_t mixmuljmptest(uint64_t iterations);
extern uint64_t mixmuljmptest21(uint64_t iterations);
extern uint64_t mixaddjmptest(uint64_t iterations);
extern uint64_t mixaddjmp21test(uint64_t iterations);
extern uint64_t rortest(uint64_t iterations);
extern uint64_t mixmulrortest(uint64_t iterations);
extern uint64_t vecadd128test(uint64_t iterations, int arr[4]);
extern uint64_t latvecadd128test(uint64_t iterations, int arr[4]);
extern uint64_t vecmul128test(uint64_t iterations, int arr[4]);
extern uint64_t latvecmul128test(uint64_t iterations, int arr[4]);
extern uint64_t mixvecaddmul128test(uint64_t iterations, int arr[4]);
extern uint64_t faddtest(uint64_t iterations, float arr[4]);
extern uint64_t latfaddtest(uint64_t iterations, float arr[4]);
extern uint64_t vecfadd128test(uint64_t iterations, float arr[4]);
extern uint64_t vecfmul128test(uint64_t iterations, float arr[4]);
extern uint64_t latvecfadd128test(uint64_t iterations, float arr[4]);
extern uint64_t latvecfmul128test(uint64_t iterations, float arr[4]);
extern uint64_t mixvecfaddfmul128test(uint64_t iterations, float arr[4]);
extern uint64_t vecfma128test(uint64_t iterations, float arr[4]);
extern uint64_t scalarfmatest(uint64_t iterations, float arr[4]);
extern uint64_t latvecfma128test(uint64_t iterations, float arr[4]);
extern uint64_t latscalarfmatest(uint64_t iterations, float arr[4]);
extern uint64_t mixvecfaddfma128test(uint64_t iterations, float arr[4]);
extern uint64_t mixvecfmulfma128test(uint64_t iterations, float arr[4]);
// see if SIMD pipeline shares ports with scalar ALU ones
extern uint64_t mixaddvecadd128test(uint64_t iterations, int arr[4]);
extern uint64_t mix3to1addvecadd128test(uint64_t iterations, int arr[4]);
extern uint64_t mix1to1addvecadd128test(uint64_t iterations, int arr[4]);
extern uint64_t mixmulvecmultest(uint64_t iterations, int arr[4]);
// are vec int and vec fp on the same port?
extern uint64_t mixvecmulfmultest(uint64_t iterations, float farr[4], int iarr[4]);
extern uint64_t mixvecaddfaddtest(uint64_t iterations, float farr[4], int iarr[4]);
// where are the branch ports
extern uint64_t mixjmpvecaddtest(uint64_t iterations, int arr[4]);
extern uint64_t mixjmpvecmultest(uint64_t iterations, int arr[4]);
// load/store
extern uint64_t loadtest(uint64_t iterations, int arr[4]);
extern uint64_t mixloadstoretest(uint64_t iterations, int arr[4], int sink[4]);
extern uint64_t mix21loadstoretest(uint64_t iterations, int arr[4], int sink[4]);
extern uint64_t vecloadtest(uint64_t iterations, int arr[4]);
extern uint64_t vecstoretest(uint64_t iterations, int arr[4], int sink[4]);
// renamer tests
extern uint64_t indepmovtest(uint64_t iterations);
extern uint64_t depmovtest(uint64_t iterations);
extern uint64_t xorzerotest(uint64_t iterations);
extern uint64_t movzerotest(uint64_t iterations);
extern uint64_t subzerotest(uint64_t iterations);
// Is crypto separate
extern uint64_t aesetest(uint64_t iterations, int arr[4]);
extern uint64_t mixaesevecadd128test(uint64_t iterations, int arr[4]);
extern uint64_t pmulltest(uint64_t iterations, int arr[4]);
extern uint64_t mixpmulladd128test(uint64_t iterations, int arr[4]);
float fpTestArr[4] __attribute__ ((aligned (64))) = { 0.2, 1.5, 2.7, 3.14 };
/* 64B-aligned source operands for the load / vector-op kernels */
int intTestArr[4] __attribute__ ((aligned (64))) = { 1, 2, 3, 4 };
/* 64B-aligned store target for the store kernels */
int sinkArr[4] __attribute__ ((aligned (64))) = { 2, 3, 4, 5 };
/* Times testfunc(iterations) and returns operations per clock, given the
   previously estimated clock speed. Defined below. */
float measureFunction(uint64_t iterations, float clockSpeedGhz, uint64_t (*testfunc)(uint64_t));
/* Forward declarations for the wrapper functions defined at the bottom of this
   file; each binds one of the global test arrays to an asm kernel. */
uint64_t vecadd128wrapper(uint64_t iterations);
uint64_t latvecadd128wrapper(uint64_t iterations);
uint64_t vecmul128wrapper(uint64_t iterations);
uint64_t latvecmul128wrapper(uint64_t iterations);
uint64_t mixvecaddmul128wrapper(uint64_t iterations);
uint64_t faddwrapper(uint64_t iterations);
uint64_t latfaddwrapper(uint64_t iterations);
uint64_t vecfadd128wrapper(uint64_t iterations);
uint64_t latvecfadd128wrapper(uint64_t iterations);
uint64_t vecfmul128wrapper(uint64_t iterations);
uint64_t latvecfmul128wrapper(uint64_t iterations);
uint64_t mixvecfaddfmul128wrapper(uint64_t iterations);
uint64_t mixaddvecadd128wrapper(uint64_t iterations);
uint64_t mix3to1addvecadd128wrapper(uint64_t iterations);
uint64_t mix1to1addvecadd128wrapper(uint64_t iterations);
uint64_t mixmulvecmulwrapper(uint64_t iterations);
uint64_t mixvecmulfmulwrapper(uint64_t iterations);
uint64_t mixvecaddfaddwrapper(uint64_t iterations);
uint64_t mixjmpvecaddwrapper(uint64_t iterations);
uint64_t mixjmpvecmulwrapper(uint64_t iterations);
uint64_t vecloadwrapper(uint64_t iterations);
uint64_t loadwrapper(uint64_t iterations);
uint64_t vecstorewrapper(uint64_t iterations);
uint64_t mixloadstorewrapper(uint64_t iterations);
uint64_t mix21loadstorewrapper(uint64_t iterations);
uint64_t vecfma128wrapper(uint64_t iterations);
uint64_t scalarfmawrapper(uint64_t iterations);
uint64_t latscalarfmawrapper(uint64_t iterations);
uint64_t mixvecfaddfma128wrapper(uint64_t iterations);
uint64_t mixvecfmulfma128wrapper(uint64_t iterations);
uint64_t latvecfma128wrapper(uint64_t iteration);
uint64_t aesetestwrapper(uint64_t iterations);
uint64_t mixaesevecadd128wrapper(uint64_t iterations);
uint64_t pmullwrapper(uint64_t iterations);
uint64_t mixpmulladd128wrapper(uint64_t iterations);
/* threads = worker thread count (0 = run tests inline on the main thread);
   hardaffinity = nonzero when workers should re-pin to cpuset themselves */
int threads = 0, hardaffinity = 0;
cpu_set_t cpuset;
int main(int argc, char *argv[]) {
    /* Entry point: parses options, estimates the clock speed from a
     * dependent-add chain (clktest), then prints throughput (ops/clk) and
     * latency (clocks) figures for each asm microbenchmark kernel.
     * Options:
     *   -affinity <cpu>   pin this thread to one cpu
     *   -hardaffinity     pin to cpus 0 and 1; worker threads re-pin too
     *   -threads <n>      run each test on n threads
     *   -iter <n>         multiply iteration counts by n
     */
    struct timeval startTv, endTv;
    struct timezone startTz, endTz;
    uint64_t iterations = 1500000000;
    uint64_t iterationsHigh = iterations * 5;
    uint64_t time_diff_ms;
    float latency, clockSpeedGhz;
    /* Block-scope declaration: the asm exports mul64test alongside mul32test.
       Declared here in case the file-level prototypes omit it. */
    uint64_t mul64test(uint64_t);
    if (argc > 1) {
        for (int argIdx = 1; argIdx < argc; argIdx++) {
            if (*(argv[argIdx]) == '-') {
                char *arg = argv[argIdx] + 1;
                if (strncmp(arg, "affinity", 8) == 0) {
                    argIdx++;
                    if (argIdx >= argc) {   // option value missing
                        fprintf(stderr, "-affinity requires a cpu number\n");
                        return 1;
                    }
                    int targetCpu = atoi(argv[argIdx]);
                    CPU_ZERO(&cpuset);
                    CPU_SET(targetCpu, &cpuset);
                    sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
                    fprintf(stderr, "Set affinity to %d\n", targetCpu);
                }
                else if (strncmp(arg, "hardaffinity", 12) == 0) {
                    CPU_ZERO(&cpuset);
                    CPU_SET(0, &cpuset);
                    CPU_SET(1, &cpuset);
                    sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
                    /* message corrected to match the cpus actually set above */
                    fprintf(stderr, "Set affinity 0,1\n");
                    hardaffinity = 1;
                }
                else if (strncmp(arg, "threads", 7) == 0) {
                    argIdx++;
                    if (argIdx >= argc) {   // option value missing
                        fprintf(stderr, "-threads requires a thread count\n");
                        return 1;
                    }
                    threads = atoi(argv[argIdx]);
                    fprintf(stderr, "Multithreading mode, %d threads\n", threads);
                }
                else if (strncmp(arg, "iter", 4) == 0) {
                    argIdx++;
                    if (argIdx >= argc) {   // option value missing
                        fprintf(stderr, "-iter requires a multiplier\n");
                        return 1;
                    }
                    int iterMul = atoi(argv[argIdx]);
                    iterations *= iterMul;
                    iterationsHigh *= iterMul;
                    fprintf(stderr, "Scaled iterations by %d\n", iterMul);
                }
            }
        }
    }
    // figure out clock speed
    gettimeofday(&startTv, &startTz);
    clktest(iterations);
    gettimeofday(&endTv, &endTz);
    time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
    latency = 1e6 * (float)time_diff_ms / (float)iterations;
    // clk speed should be 1/latency, assuming we got one add per clk, roughly
    clockSpeedGhz = 1/latency;
    printf("Estimated clock speed> %.2f GHz\n", clockSpeedGhz);
    printf("Nops per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, noptest));
    printf("Adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addtest));
    printf("XORs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, eortest));
    printf("CMPs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, cmptest));
    printf("\n----Renamer Tests----\n");
    /* typo fix: "Indepdent" -> "Independent" */
    printf("Independent movs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, indepmovtest));
    printf("Dependent movs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, depmovtest));
    printf("eor -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, xorzerotest));
    printf("mov -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, movzerotest));
    printf("sub -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, subzerotest));
    printf("\n----ALU Pipe Layout Tests----\n");
    printf("Not taken jmps per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, jmptest));
    printf("Jump fusion test> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fusejmptest));
    printf("1:1 mixed not taken jmps / muls per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest));
    printf("1:2 mixed not taken jmps / muls per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest21));
    printf("1:1 mixed not taken jmps / adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmptest));
    printf("1:2 mixed not taken jmps / adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmp21test));
    printf("1:1 mixed add/mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmultest));
    printf("2:1 mixed add/mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmul21test));
    printf("ror per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, rortest));
    printf("1:1 mixed mul/ror per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulrortest));
    printf("1:3 madd:add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, maddaddtest));
    printf("32-bit mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul32test));
    /* bug fix: this previously re-ran mul32test, so the 64-bit figure was
       just a duplicate of the 32-bit one */
    printf("64-bit mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul64test));
    printf("64-bit multiply latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latmul64test));
    printf("\n----FP/ASIMD Crypto Tests----\n");
    printf("aese per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, aesetestwrapper));
    printf("1:1 aese and vec 128 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaesevecadd128wrapper));
    printf("pmull per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, pmullwrapper));
    printf("1:1 pmull and vec 128 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixpmulladd128wrapper));
    printf ("\n----FP/ASIMD Tests----\n");
    printf("scalar fp32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, faddwrapper));
    printf("128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecadd128wrapper));
    printf("128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecmul128wrapper));
    printf("128-bit vec int32 mixed multiply and add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddmul128wrapper));
    printf("128-bit vec fp32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfadd128wrapper));
    printf("128-bit vec fp32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfmul128wrapper));
    printf("128-bit vec fp32 mixed multiply and add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfmul128wrapper));
    printf("2:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddvecadd128wrapper));
    printf("3:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix3to1addvecadd128wrapper));
    printf("1:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix1to1addvecadd128wrapper));
    printf("1:1 mixed scalar 32-bit multiply and 128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulvecmulwrapper));
    printf("1:1 mixed 128-bit vec fp32 multiply and 128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecmulfmulwrapper));
    printf("1:1 mixed 128-bit vec fp32 add and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddfaddwrapper));
    printf("1:2 mixed not taken jumps and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecaddwrapper));
    printf("1:1 mixed not taken jumps and 128-bit vec int32 mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecmulwrapper));
    printf("128-bit vec int32 add latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecadd128wrapper));
    printf("128-bit vec int32 mul latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecmul128wrapper));
    /* consistency: latency (dependent-chain) tests use the base iteration
       count like the other latency measurements above and below */
    printf("Scalar FADD Latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latfaddwrapper));
    printf("128-bit vector FADD latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfadd128wrapper));
    printf("128-bit vector FMUL latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfmul128wrapper));
    printf("128-bit vector FMA per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfma128wrapper));
    printf("128-bit vector FMA latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfma128wrapper));
    printf("Scalar FMA per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, scalarfmawrapper));
    printf("Scalar FMA latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latscalarfmawrapper));
    printf("1:1 mixed 128-bit vector FMA/FADD per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfma128wrapper));
    printf("1:1 mixed 128-bit vector FMA/FMUL per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfmulfma128wrapper));
    printf("\n----Load/Store Tests----\n");
    printf("128-bit vec loads per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecloadwrapper));
    printf("128-bit vec stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecstorewrapper));
    printf("64-bit loads per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, loadwrapper));
    printf("1:1 mixed 64-bit loads/stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixloadstorewrapper));
    printf("2:1 mixed 64-bit loads/stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix21loadstorewrapper));
    return 0;
}
/* Per-thread work description handed to TestThread via pthread_create. */
struct TestThreadData {
uint64_t iterations;
uint64_t (*testfunc)(uint64_t);
};
/* pthread entry point: if hard affinity was requested, re-pin this worker to
 * the global cpuset, then run the configured kernel for the configured
 * iteration count. The kernel's return value is discarded. */
void *TestThread(void *param) {
struct TestThreadData *data = (struct TestThreadData *)param;
if (hardaffinity) sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset);
data->testfunc(data->iterations);
return NULL;
}
float measureFunction(uint64_t iterations, float clockSpeedGhz, uint64_t (*testfunc)(uint64_t)) {
    /* Times testfunc(iterations) with gettimeofday and converts elapsed time
     * into operations per clock, assuming one "operation" per iteration and
     * the previously estimated clock speed.
     * In multithreaded mode (global threads > 0) each worker runs the full
     * iteration count, but the result is still normalized by a single
     * thread's iterations — so the figure reflects aggregate throughput
     * relative to one stream's work. */
    struct timeval startTv, endTv;
    struct timezone startTz, endTz;
    uint64_t time_diff_ms;
    float latency, opsPerNs;
    gettimeofday(&startTv, &startTz);
    if (threads == 0) testfunc(iterations);
    else {
        pthread_t *testThreads = (pthread_t *)malloc(threads * sizeof(pthread_t));
        struct TestThreadData *testData = (struct TestThreadData *)malloc(threads * sizeof(struct TestThreadData));
        /* previously unchecked: a failed allocation would crash in the loop below */
        if (testThreads == NULL || testData == NULL) {
            fprintf(stderr, "Out of memory allocating thread data\n");
            exit(1);
        }
        for (int threadIdx = 0; threadIdx < threads; threadIdx++) {
            testData[threadIdx].iterations = iterations;
            testData[threadIdx].testfunc = testfunc;
            if (pthread_create(testThreads + threadIdx, NULL, TestThread, testData + threadIdx) != 0) {
                fprintf(stderr, "Could not create thread %d\n", threadIdx);
                exit(1);
            }
        }
        for (int threadIdx = 0; threadIdx < threads; threadIdx++) {
            pthread_join(testThreads[threadIdx], NULL);
        }
        free(testThreads);
        free(testData);
    }
    gettimeofday(&endTv, &endTz);
    time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
    latency = 1e6 * (float)time_diff_ms / (float)iterations;
    opsPerNs = 1/latency;
    //printf("%f adds/ns, %f adds/clk?\n", opsPerNs, opsPerNs / clockSpeedGhz);
    return opsPerNs / clockSpeedGhz;
}
/* Wrappers: the asm kernels take (iterations, data pointer); these bind the
   global test arrays so the kernels fit measureFunction's one-arg signature. */
uint64_t vecadd128wrapper(uint64_t iterations) {
return vecadd128test(iterations, intTestArr);
}
uint64_t vecmul128wrapper(uint64_t iterations) {
return vecmul128test(iterations, intTestArr);
}
/* lat* variants run dependent chains for latency rather than throughput */
uint64_t latvecadd128wrapper(uint64_t iterations) {
return latvecadd128test(iterations, intTestArr);
}
uint64_t latvecmul128wrapper(uint64_t iterations) {
return latvecmul128test(iterations, intTestArr);
}
uint64_t mixvecaddmul128wrapper(uint64_t iterations) {
return mixvecaddmul128test(iterations, intTestArr);
}
/* FP kernels read fpTestArr (declared earlier in this file, not visible in
   this chunk) — presumably a 64B-aligned float array; verify against the
   declaration above. */
uint64_t faddwrapper(uint64_t iterations) {
return faddtest(iterations, fpTestArr);
}
uint64_t latfaddwrapper(uint64_t iterations) {
return latfaddtest(iterations, fpTestArr);
}
uint64_t latvecfadd128wrapper(uint64_t iterations) {
return latvecfadd128test(iterations, fpTestArr);
}
uint64_t latvecfmul128wrapper(uint64_t iterations) {
return latvecfmul128test(iterations, fpTestArr);
}
uint64_t vecfadd128wrapper(uint64_t iterations) {
return vecfadd128test(iterations, fpTestArr);
}
uint64_t vecfmul128wrapper(uint64_t iterations) {
return vecfmul128test(iterations, fpTestArr);
}
uint64_t mixvecfaddfmul128wrapper(uint64_t iterations) {
return mixvecfaddfmul128test(iterations, fpTestArr);
}
/* mix* kernels interleave two instruction classes to probe shared vs.
   separate execution pipes; ratios are encoded in the kernel names. */
uint64_t mixaddvecadd128wrapper(uint64_t iterations) {
return mixaddvecadd128test(iterations, intTestArr);
}
uint64_t mix3to1addvecadd128wrapper(uint64_t iterations) {
return mix3to1addvecadd128test(iterations, intTestArr);
}
uint64_t mix1to1addvecadd128wrapper(uint64_t iterations) {
return mix1to1addvecadd128test(iterations, intTestArr);
}
uint64_t mixmulvecmulwrapper(uint64_t iterations) {
return mixmulvecmultest(iterations, intTestArr);
}
/* these two kernels take both arrays: (iterations, fp ptr, int ptr) */
uint64_t mixvecmulfmulwrapper(uint64_t iterations) {
return mixvecmulfmultest(iterations, fpTestArr, intTestArr);
}
uint64_t mixvecaddfaddwrapper(uint64_t iterations) {
return mixvecaddfaddtest(iterations, fpTestArr, intTestArr);
}
uint64_t mixjmpvecaddwrapper(uint64_t iterations) {
return mixjmpvecaddtest(iterations, intTestArr);
}
uint64_t mixjmpvecmulwrapper(uint64_t iterations) {
return mixjmpvecmultest(iterations, intTestArr);
}
/* Load/store kernels read from intTestArr and write to sinkArr. */
uint64_t vecloadwrapper(uint64_t iterations) {
return vecloadtest(iterations, intTestArr);
}
uint64_t vecstorewrapper(uint64_t iterations) {
return vecstoretest(iterations, intTestArr, sinkArr);
}
uint64_t loadwrapper(uint64_t iterations) {
/* sanity check: intTestArr is declared aligned(64), so this should never
   fire unless the toolchain ignored the attribute */
if (((uint64_t)intTestArr & 63) != 0) {
printf("Warning - load may not be 64B aligned\n");
}
return loadtest(iterations, intTestArr);
}
uint64_t mixloadstorewrapper(uint64_t iterations) {
return mixloadstoretest(iterations, intTestArr, sinkArr);
}
uint64_t mix21loadstorewrapper(uint64_t iterations) {
return mix21loadstoretest(iterations, intTestArr, sinkArr);
}
/* FMA kernels (vector fmla / scalar fmadd), throughput and latency variants. */
uint64_t vecfma128wrapper(uint64_t iterations) {
return vecfma128test(iterations, fpTestArr);
}
uint64_t scalarfmawrapper(uint64_t iterations) {
return scalarfmatest(iterations, fpTestArr);
}
uint64_t latscalarfmawrapper(uint64_t iterations) {
return latscalarfmatest(iterations, fpTestArr);
}
uint64_t latvecfma128wrapper(uint64_t iterations) {
return latvecfma128test(iterations, fpTestArr);
}
uint64_t mixvecfmulfma128wrapper(uint64_t iterations) {
return mixvecfmulfma128test(iterations, fpTestArr);
}
uint64_t mixvecfaddfma128wrapper(uint64_t iterations) {
return mixvecfaddfma128test(iterations, fpTestArr);
}
/* Crypto-extension kernels (aese / pmull), alone and mixed with vector adds. */
uint64_t aesetestwrapper(uint64_t iterations) {
return aesetest(iterations, intTestArr);
}
uint64_t mixaesevecadd128wrapper(uint64_t iterations) {
return mixaesevecadd128test(iterations, intTestArr);
}
uint64_t pmullwrapper(uint64_t iterations) {
return pmulltest(iterations, intTestArr);
}
uint64_t mixpmulladd128wrapper(uint64_t iterations) {
return mixpmulladd128test(iterations, intTestArr);
}
================================================
FILE: InstructionRate/arm_instructionrate.s
================================================
.text
.global clktest
.global addtest
.global eortest
.global maddaddtest
.global cmptest
.global addmultest
.global addmul21test
.global mixaddjmp21test
.global mul32test
.global mul64test
.global latmul64test
.global noptest
.global fusejmptest
.global jmptest
.global mixmuljmptest
.global mixmuljmptest21
.global mixaddjmptest
.global rortest
.global mixmulrortest
.global _clktest
.global _addtest
.global _eortest
.global _maddaddtest
.global _cmptest
.global _addmultest
.global _addmul21test
.global _mixaddjmp21test
.global _mul32test
.global _mul64test
.global _latmul64test
.global _noptest
.global _fusejmptest
.global _jmptest
.global _mixmuljmptest
.global _mixmuljmptest21
.global _mixaddjmptest
.global _rortest
.global _mixmulrortest
.global vecadd128test
.global latvecadd128test
.global vecmul128test
.global latvecmul128test
.global mixvecaddmul128test
.global faddtest
.global latfaddtest
.global latfmultest
.global latvecfadd128test
.global latvecfmul128test
.global vecfadd128test
.global vecfmul128test
.global mixvecfaddfmul128test
.global mixaddvecadd128test
.global mix3to1addvecadd128test
.global mix1to1addvecadd128test
.global mixmulvecmultest
.global mixvecmulfmultest
.global mixvecaddfaddtest
.global mixjmpvecaddtest
.global mixjmpvecmultest
.global vecfma128test
.global latvecfma128test
.global scalarfmatest
.global latscalarfmatest
.global aesetest
.global mixaesevecadd128test
.global pmulltest
.global mixpmulladd128test
.global _vecadd128test
.global _latvecadd128test
.global _vecmul128test
.global _latvecmul128test
.global _mixvecaddmul128test
.global _faddtest
.global _latfaddtest
.global _latfmultest
.global _latvecfadd128test
.global _latvecfmul128test
.global _vecfadd128test
.global _vecfmul128test
.global _mixvecfaddfmul128test
.global _mixaddvecadd128test
.global _mix3to1addvecadd128test
.global _mix1to1addvecadd128test
.global _mixmulvecmultest
.global _mixvecmulfmultest
.global _mixvecaddfaddtest
.global _mixjmpvecaddtest
.global _mixjmpvecmultest
.global _vecfma128test
.global _latvecfma128test
.global _scalarfmatest
.global _latscalarfmatest
.global mixvecfaddfma128test
.global mixvecfmulfma128test
.global loadtest
.global mixloadstoretest
.global mix21loadstoretest
.global vecloadtest
.global vecstoretest
.global _mixvecfaddfma128test
.global _mixvecfmulfma128test
.global _loadtest
.global _mixloadstoretest
.global _mix21loadstoretest
.global _vecloadtest
.global _vecstoretest
//renamer tests
.global indepmovtest
.global depmovtest
.global xorzerotest
.global movzerotest
.global subzerotest
.global _indepmovtest
.global _depmovtest
.global _xorzerotest
.global _movzerotest
.global _subzerotest
.global _aesetest
.global _mixaesevecadd128test
.global _pmulltest
.global _mixpmulladd128test
.balign 4
/* x0 = arg = iteration count. all iteration counts must be divisible by 10 */
/* clktest: dependent add chain (each add waits on the previous one), so on
   typical cores this retires ~1 add/clk and elapsed time estimates the clock.
   20 adds per loop pass; x14 = 20 is the per-pass counter decrement.
   NOTE(review): x9-x15 are caller-saved under AAPCS64, so the stack
   save/restore is not strictly required — kept as written. */
_clktest:
clktest:
sub sp, sp, #0x30
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
mov x15, 1
mov x14, 20
eor x13, x13, x13
clktest_loop:
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
add x13, x13, x15
sub x0, x0, x14
cbnz x0, clktest_loop
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x30
ret
/* noptest: 30 nops per pass (x14 = 30) to measure front-end/retire width.
   The x15/x13 setup mirrors the other kernels but is unused here. */
_noptest:
noptest:
sub sp, sp, #0x30
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
mov x15, 1
mov x14, 30
eor x13, x13, x13
noptest_loop:
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
sub x0, x0, x14
cbnz x0, noptest_loop
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x30
ret
/* addtest: add throughput. 30 adds per pass spread over 5 independent
   accumulators (x9-x13), so the limit is pipe count, not latency. */
_addtest:
addtest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 30
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
eor x10, x10, x10
eor x9, x9, x9
addtest_loop:
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add x9, x9, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add x9, x9, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add x9, x9, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add x9, x9, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add x9, x9, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add x9, x9, x15
sub x0, x0, x14
cbnz x0, addtest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* maddaddtest: 1 madd per 3 adds (20 instructions per pass, x14 = 20).
   The madd reads x0 (the live loop counter) as a multiplicand, which keeps
   it from being hoisted or simplified. */
_maddaddtest:
maddaddtest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 20
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
mov x10, 2
eor x9, x9, x9
mov x8, 3
maddaddtest_loop:
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
madd x10, x8, x0, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
madd x10, x8, x0, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
madd x10, x8, x0, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
madd x10, x8, x0, x15
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
madd x10, x8, x0, x15
sub x0, x0, x14
cbnz x0, maddaddtest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* eortest: xor throughput, same 5-accumulator / 30-per-pass structure as
   addtest. */
_eortest:
eortest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 30
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
eor x10, x10, x10
eor x9, x9, x9
eortest_loop:
eor x13, x13, x15
eor x12, x12, x15
eor x11, x11, x15
eor x10, x10, x15
eor x9, x9, x15
eor x13, x13, x15
eor x12, x12, x15
eor x11, x11, x15
eor x10, x10, x15
eor x9, x9, x15
eor x13, x13, x15
eor x12, x12, x15
eor x11, x11, x15
eor x10, x10, x15
eor x9, x9, x15
eor x13, x13, x15
eor x12, x12, x15
eor x11, x11, x15
eor x10, x10, x15
eor x9, x9, x15
eor x13, x13, x15
eor x12, x12, x15
eor x11, x11, x15
eor x10, x10, x15
eor x9, x9, x15
eor x13, x13, x15
eor x12, x12, x15
eor x11, x11, x15
eor x10, x10, x15
eor x9, x9, x15
sub x0, x0, x14
cbnz x0, eortest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* cmptest: compare throughput, 30 cmps per pass. Every cmp writes the same
   flags register, so this also probes flag renaming. */
_cmptest:
cmptest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 30
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
eor x10, x10, x10
eor x9, x9, x9
cmptest_loop:
cmp x13, x13
cmp x12, x12
cmp x11, x11
cmp x10, x10
cmp x9, x9
cmp x13, x13
cmp x12, x12
cmp x11, x11
cmp x10, x10
cmp x9, x9
cmp x13, x13
cmp x12, x12
cmp x11, x11
cmp x10, x10
cmp x9, x9
cmp x13, x13
cmp x12, x12
cmp x11, x11
cmp x10, x10
cmp x9, x9
cmp x13, x13
cmp x12, x12
cmp x11, x11
cmp x10, x10
cmp x9, x9
cmp x13, x13
cmp x12, x12
cmp x11, x11
cmp x10, x10
cmp x9, x9
sub x0, x0, x14
cbnz x0, cmptest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* addmultest: 1:1 interleave of 32-bit muls and 64-bit adds, 20 instructions
   per pass, to see whether adds can issue alongside multiplies. */
_addmultest:
addmultest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 2
mov x14, 20
mov x13, 2
eor x12, x12, x12
mov x11, 2
eor x10, x10, x10
mov x9, 2
mov x8, 2
addmultest_loop:
mul w13, w13, w15
add x12, x12, x15
mul w11, w11, w15
add x10, x10, x15
mul w9, w9, w15
add x12, x12, x15
mul w8, w8, w15
add x10, x10, x15
mul w13, w13, w15
add x12, x12, x15
mul w11, w11, w15
add x10, x10, x15
mul w9, w9, w15
add x12, x12, x15
mul w8, w8, w15
add x10, x10, x15
mul w13, w13, w15
add x12, x12, x15
mul w11, w11, w15
add x10, x10, x15
sub x0, x0, x14
cbnz x0, addmultest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* addmul21test: 2 adds per mul, 24 instructions per pass. Since 24 does not
   evenly divide the iteration counts, the loop exits on a signed compare
   (b.gt) instead of cbnz so x0 going negative still terminates. */
_addmul21test:
addmul21test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 2
mov x14, 24
mov x13, 2
eor x12, x12, x12
mov x11, 2
eor x10, x10, x10
mov x9, 2
mov x8, 2
addmul21test_loop:
mul w13, w13, w15
add x12, x12, x15
add x10, x10, x15
mul w11, w11, w15
add x12, x12, x15
add x10, x10, x15
mul w9, w9, w15
add x12, x12, x15
add x10, x10, x15
mul w8, w8, w15
add x12, x12, x15
add x10, x10, x15
mul w13, w13, w15
add x12, x12, x15
add x10, x10, x15
mul w11, w11, w15
add x12, x12, x15
add x10, x10, x15
mul w9, w9, w15
add x12, x12, x15
add x10, x10, x15
mul w8, w8, w15
add x12, x12, x15
add x10, x10, x15
sub x0, x0, x14
cmp x0, 0
b.gt addmul21test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* mul32test: 32-bit multiply throughput, 20 muls per pass over 6 independent
   registers (w8-w13). */
_mul32test:
mul32test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 2
mov x14, 20
mov x13, x15
mov x12, x15
mov x11, x15
mov x10, x15
mov x9, x15
mov x8, x15
mul32test_loop:
mul w13, w13, w15
mul w12, w12, w15
mul w11, w11, w15
mul w10, w10, w15
mul w9, w9, w15
mul w8, w8, w15
mul w13, w13, w15
mul w12, w12, w15
mul w11, w11, w15
mul w10, w10, w15
mul w9, w9, w15
mul w8, w8, w15
mul w13, w13, w15
mul w12, w12, w15
mul w11, w11, w15
mul w10, w10, w15
mul w9, w9, w15
mul w8, w8, w15
mul w13, w13, w15
mul w12, w12, w15
sub x0, x0, x14
cbnz x0, mul32test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* mul64test: 64-bit multiply throughput, same structure as mul32test with
   x-register forms. */
_mul64test:
mul64test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 2
mov x14, 20
mov x13, x15
mov x12, x15
mov x11, x15
mov x10, x15
mov x9, x15
mov x8, x15
mul64test_loop:
mul x13, x13, x15
mul x12, x12, x15
mul x11, x11, x15
mul x10, x10, x15
mul x9, x9, x15
mul x8, x8, x15
mul x13, x13, x15
mul x12, x12, x15
mul x11, x11, x15
mul x10, x10, x15
mul x9, x9, x15
mul x8, x8, x15
mul x13, x13, x15
mul x12, x12, x15
mul x11, x11, x15
mul x10, x10, x15
mul x9, x9, x15
mul x8, x8, x15
mul x13, x13, x15
mul x12, x12, x15
sub x0, x0, x14
cbnz x0, mul64test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* latmul64test: 64-bit multiply latency via a fully dependent chain
   (x13 = x13 * x13). NOTE(review): repeated squaring of 2 wraps to 0 mod
   2^64 after a few iterations, so most of the chain multiplies zeros —
   assumed not to change timing; confirm on early-out multipliers. */
_latmul64test:
latmul64test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 2
mov x14, 20
mov x13, x15
latmul64test_loop:
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
mul x13, x13, x13
sub x0, x0, x14
cbnz x0, latmul64test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
/* needs an additional parameter passed in x1 - ptr to a 16B array of
   elements to load (the C wrappers pass intTestArr for the int kernels and
   fpTestArr for the fp kernels) */
/* vecadd128test: 128-bit vector int32 add throughput; 20 adds per pass over
   6 independent vector registers (v16-v21). */
_vecadd128test:
vecadd128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
vecadd128test_loop:
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
add v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
add v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
add v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
add v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
add v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
add v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
sub x0, x0, x14
cbnz x0, vecadd128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* latvecadd128test: vector int32 add latency via a dependent chain on v16. */
_latvecadd128test:
latvecadd128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
latvecadd128test_loop:
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
add v16.4s, v16.4s, v16.4s
sub x0, x0, x14
cbnz x0, latvecadd128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* faddtest: scalar fp32 add throughput; 20 fadds per pass over 6 independent
   registers, initialized from four distinct floats at [x1..x1+0xC]. */
_faddtest:
faddtest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr s16, [x1]
ldr s17, [x1, #0x4]
ldr s18, [x1, #0x8]
ldr s19, [x1, #0xC]
ldr s20, [x1]
ldr s21, [x1, #0x4]
faddtest_loop:
fadd s16, s16, s16
fadd s17, s17, s17
fadd s18, s18, s18
fadd s19, s19, s19
fadd s20, s20, s20
fadd s21, s21, s21
fadd s16, s16, s16
fadd s17, s17, s17
fadd s18, s18, s18
fadd s19, s19, s19
fadd s20, s20, s20
fadd s21, s21, s21
fadd s16, s16, s16
fadd s17, s17, s17
fadd s18, s18, s18
fadd s19, s19, s19
fadd s20, s20, s20
fadd s21, s21, s21
fadd s16, s16, s16
fadd s17, s17, s17
sub x0, x0, x14
cbnz x0, faddtest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* latfaddtest: scalar fp32 add latency via a dependent chain on s16. */
_latfaddtest:
latfaddtest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr s16, [x1]
latfaddtest_loop:
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
fadd s16, s16, s16
sub x0, x0, x14
cbnz x0, latfaddtest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* latfmultest: scalar fp32 multiply latency via a dependent chain on s16. */
_latfmultest:
latfmultest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr s16, [x1]
latfmultest_loop:
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
fmul s16, s16, s16
sub x0, x0, x14
cbnz x0, latfmultest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* latvecmul128test: 128-bit vector int32 multiply latency, dependent chain
   on v16. */
_latvecmul128test:
latvecmul128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
latvecmul128test_loop:
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
mul v16.4s, v16.4s, v16.4s
sub x0, x0, x14
cbnz x0, latvecmul128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* vecmul128test: 128-bit vector int32 multiply throughput; 20 muls per pass
   over 6 independent vector registers. */
_vecmul128test:
vecmul128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
vecmul128test_loop:
mul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
mul v18.4s, v18.4s, v18.4s
mul v19.4s, v19.4s, v19.4s
mul v20.4s, v20.4s, v20.4s
mul v21.4s, v21.4s, v21.4s
mul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
mul v18.4s, v18.4s, v18.4s
mul v19.4s, v19.4s, v19.4s
mul v20.4s, v20.4s, v20.4s
mul v21.4s, v21.4s, v21.4s
mul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
mul v18.4s, v18.4s, v18.4s
mul v19.4s, v19.4s, v19.4s
mul v20.4s, v20.4s, v20.4s
mul v21.4s, v21.4s, v21.4s
mul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
sub x0, x0, x14
cbnz x0, vecmul128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* scalarfmatest: scalar fp32 fused multiply-add throughput; 20 fmadds per
   pass over 10 independent registers (s16-s25). Loads use the full 128-bit
   q-register form; only the low 32 bits are consumed by fmadd. */
_scalarfmatest:
scalarfmatest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
ldr q24, [x1]
ldr q25, [x1]
scalarfmatest_loop:
fmadd s16, s16, s16, s16
fmadd s17, s17, s17, s17
fmadd s18, s18, s18, s18
fmadd s19, s19, s19, s19
fmadd s20, s20, s20, s20
fmadd s21, s21, s21, s21
fmadd s22, s22, s22, s22
fmadd s23, s23, s23, s23
fmadd s24, s24, s24, s24
fmadd s25, s25, s25, s25
fmadd s16, s16, s16, s16
fmadd s17, s17, s17, s17
fmadd s18, s18, s18, s18
fmadd s19, s19, s19, s19
fmadd s20, s20, s20, s20
fmadd s21, s21, s21, s21
fmadd s22, s22, s22, s22
fmadd s23, s23, s23, s23
fmadd s24, s24, s24, s24
fmadd s25, s25, s25, s25
sub x0, x0, x14
cbnz x0, scalarfmatest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* latscalarfmatest: scalar FMA latency via a dependent chain on s16. */
_latscalarfmatest:
latscalarfmatest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
latscalarfmatest_loop:
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
fmadd s16, s16, s16, s16
sub x0, x0, x14
cbnz x0, latscalarfmatest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
/* vecfma128test: 128-bit vector FMA (fmla accumulates: Vd += Vn * Vm)
   throughput; 20 fmlas per pass over v16-v25. */
_vecfma128test:
vecfma128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
ldr q24, [x1]
ldr q25, [x1]
vecfma128test_loop:
fmla v16.4s, v16.4s, v16.4s
fmla v17.4s, v17.4s, v17.4s
fmla v18.4s, v18.4s, v18.4s
fmla v19.4s, v19.4s, v19.4s
fmla v20.4s, v20.4s, v20.4s
fmla v21.4s, v21.4s, v21.4s
fmla v22.4s, v22.4s, v22.4s
fmla v23.4s, v23.4s, v23.4s
fmla v24.4s, v24.4s, v24.4s
fmla v25.4s, v25.4s, v25.4s
fmla v16.4s, v16.4s, v16.4s
fmla v17.4s, v17.4s, v17.4s
fmla v18.4s, v18.4s, v18.4s
fmla v19.4s, v19.4s, v19.4s
fmla v20.4s, v20.4s, v20.4s
fmla v21.4s, v21.4s, v21.4s
fmla v22.4s, v22.4s, v22.4s
fmla v23.4s, v23.4s, v23.4s
fmla v24.4s, v24.4s, v24.4s
fmla v25.4s, v25.4s, v25.4s
sub x0, x0, x14
cbnz x0, vecfma128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixvecfmulfma128test(x0 = iteration count, x1 = FP test data pointer)
// 1:1 mix of 128-bit fmla and fmul (10 of each per iteration) to see
// whether FMA and FP multiply issue on the same or separate pipes.
_mixvecfmulfma128test:
mixvecfmulfma128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
ldr q24, [x1]
ldr q25, [x1]
mixvecfmulfma128test_loop:
fmla v16.4s, v16.4s, v16.4s
fmul v17.4s, v17.4s, v17.4s
fmla v18.4s, v18.4s, v18.4s
fmul v19.4s, v19.4s, v19.4s
fmla v20.4s, v20.4s, v20.4s
fmul v21.4s, v21.4s, v21.4s
fmla v22.4s, v22.4s, v22.4s
fmul v23.4s, v23.4s, v23.4s
fmla v24.4s, v24.4s, v24.4s
fmul v25.4s, v25.4s, v25.4s
fmla v16.4s, v16.4s, v16.4s
fmul v17.4s, v17.4s, v17.4s
fmla v18.4s, v18.4s, v18.4s
fmul v19.4s, v19.4s, v19.4s
fmla v20.4s, v20.4s, v20.4s
fmul v21.4s, v21.4s, v21.4s
fmla v22.4s, v22.4s, v22.4s
fmul v23.4s, v23.4s, v23.4s
fmla v24.4s, v24.4s, v24.4s
fmul v25.4s, v25.4s, v25.4s
sub x0, x0, x14
cbnz x0, mixvecfmulfma128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixvecfaddfma128test(x0 = iteration count, x1 = FP test data pointer)
// 1:1 mix of 128-bit fmla and fadd (10 of each per iteration) to see
// whether FMA and FP add can issue in parallel.
_mixvecfaddfma128test:
mixvecfaddfma128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
ldr q24, [x1]
ldr q25, [x1]
mixvecfaddfma128test_loop:
fmla v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fmla v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fmla v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fmla v22.4s, v22.4s, v22.4s
fadd v23.4s, v23.4s, v23.4s
fmla v24.4s, v24.4s, v24.4s
fadd v25.4s, v25.4s, v25.4s
fmla v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fmla v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fmla v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fmla v22.4s, v22.4s, v22.4s
fadd v23.4s, v23.4s, v23.4s
fmla v24.4s, v24.4s, v24.4s
fadd v25.4s, v25.4s, v25.4s
sub x0, x0, x14
cbnz x0, mixvecfaddfma128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// latvecfma128test(x0 = iteration count, x1 = FP test data pointer)
// 128-bit vector FMA latency: 20 fmla all on v16, one serial chain.
_latvecfma128test:
latvecfma128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
latvecfma128test_loop:
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
fmla v16.4s, v16.4s, v16.4s
sub x0, x0, x14
cbnz x0, latvecfma128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// vecfadd128test(x0 = iteration count, x1 = FP test data pointer)
// 128-bit FP add throughput: 20 independent fadd per iteration over
// v16-v21 (6-register rotation, so each chain has 5+ ops of slack).
_vecfadd128test:
vecfadd128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
vecfadd128test_loop:
fadd v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fadd v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fadd v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fadd v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fadd v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fadd v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fadd v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fadd v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fadd v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fadd v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
sub x0, x0, x14
cbnz x0, vecfadd128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// latvecfadd128test(x0 = iteration count, x1 = FP test data pointer)
// 128-bit FP add latency: 20 fadd all on v16, one serial chain.
_latvecfadd128test:
latvecfadd128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
latvecfadd128test_loop:
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
fadd v16.4s, v16.4s, v16.4s
sub x0, x0, x14
cbnz x0, latvecfadd128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// vecfmul128test(x0 = iteration count, x1 = FP test data pointer)
// 128-bit FP multiply throughput: 20 independent fmul per iteration
// over a 6-register rotation (v16-v21).
_vecfmul128test:
vecfmul128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
vecfmul128test_loop:
fmul v16.4s, v16.4s, v16.4s
fmul v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
fmul v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
fmul v21.4s, v21.4s, v21.4s
fmul v16.4s, v16.4s, v16.4s
fmul v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
fmul v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
fmul v21.4s, v21.4s, v21.4s
fmul v16.4s, v16.4s, v16.4s
fmul v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
fmul v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
fmul v21.4s, v21.4s, v21.4s
fmul v16.4s, v16.4s, v16.4s
fmul v17.4s, v17.4s, v17.4s
sub x0, x0, x14
cbnz x0, vecfmul128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// latvecfmul128test(x0 = iteration count, x1 = FP test data pointer)
// 128-bit FP multiply latency: 20 fmul all on v16, one serial chain.
_latvecfmul128test:
latvecfmul128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
latvecfmul128test_loop:
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
fmul v16.4s, v16.4s, v16.4s
sub x0, x0, x14
cbnz x0, latvecfmul128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixvecfaddfmul128test(x0 = iteration count, x1 = FP test data pointer)
// 1:1 mix of 128-bit fmul and fadd (10 of each per iteration) to check
// whether FP add and FP multiply issue on separate pipes.
_mixvecfaddfmul128test:
mixvecfaddfmul128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
mixvecfaddfmul128test_loop:
fmul v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fmul v22.4s, v22.4s, v22.4s
fadd v23.4s, v23.4s, v23.4s
fmul v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
fadd v21.4s, v21.4s, v21.4s
fmul v22.4s, v22.4s, v22.4s
fadd v23.4s, v23.4s, v23.4s
fmul v16.4s, v16.4s, v16.4s
fadd v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
fadd v19.4s, v19.4s, v19.4s
sub x0, x0, x14
cbnz x0, mixvecfaddfmul128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixvecaddmul128test(x0 = iteration count, x1 = test data pointer)
// 1:1 mix of 128-bit integer mul and add (10 of each per iteration) to
// check whether vector integer add and multiply share a pipe.
_mixvecaddmul128test:
mixvecaddmul128test:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
mixvecaddmul128test_loop:
mul v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
mul v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
mul v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
mul v22.4s, v22.4s, v22.4s
add v23.4s, v23.4s, v23.4s
mul v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
mul v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
mul v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
mul v22.4s, v22.4s, v22.4s
add v23.4s, v23.4s, v23.4s
mul v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
mul v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
sub x0, x0, x14
cbnz x0, mixvecaddmul128test_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixaddvecadd128test(x0 = iteration count, x1 = test data pointer)
// 2:1 mix of scalar integer adds and 128-bit vector adds (20 scalar +
// 10 vector = 30 instructions per iteration; x14 = 30 matches).
// Measures combined issue width across the integer and SIMD domains.
_mixaddvecadd128test:
mixaddvecadd128test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 30
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
eor x10, x10, x10
eor x9, x9, x9
eor x8, x8, x8
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
mixaddvecadd128test_loop:
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
// NOTE(review): this group reuses v20/v21 while v22/v23 were loaded and
// look intended here (copy-paste slip?). It shortens the dependency
// slack on v20/v21 slightly; confirm against other ports of this test.
add v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add x10, x10, x15
add v22.4s, v22.4s, v22.4s
add v23.4s, v23.4s, v23.4s
sub x0, x0, x14
cbnz x0, mixaddvecadd128test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mix3to1addvecadd128test(x0 = iteration count, x1 = test data pointer)
// 3:1 mix of scalar integer adds to 128-bit vector adds (30 scalar +
// 10 vector = 40 instructions per iteration; x14 = 40 matches).
_mix3to1addvecadd128test:
mix3to1addvecadd128test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 40
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
eor x10, x10, x10
eor x9, x9, x9
eor x8, x8, x8
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
mix3to1addvecadd128test_loop:
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v16.4s, v16.4s, v16.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v17.4s, v17.4s, v17.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v20.4s, v20.4s, v20.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v16.4s, v16.4s, v16.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v17.4s, v17.4s, v17.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add x12, x12, x15
add x11, x11, x15
add v20.4s, v20.4s, v20.4s
sub x0, x0, x14
cbnz x0, mix3to1addvecadd128test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mix1to1addvecadd128test(x0 = iteration count, x1 = test data pointer)
// 1:1 mix of scalar integer adds and 128-bit vector adds (20 + 20 = 40
// instructions per iteration; x14 = 40 matches).
_mix1to1addvecadd128test:
mix1to1addvecadd128test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 40
eor x13, x13, x13
eor x12, x12, x12
eor x11, x11, x11
eor x10, x10, x10
eor x9, x9, x9
eor x8, x8, x8
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
mix1to1addvecadd128test_loop:
add x13, x13, x15
add v16.4s, v16.4s, v16.4s
add x12, x12, x15
add v17.4s, v17.4s, v17.4s
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x10, x10, x15
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add v16.4s, v16.4s, v16.4s
add x12, x12, x15
add v17.4s, v17.4s, v17.4s
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x10, x10, x15
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add v16.4s, v16.4s, v16.4s
add x12, x12, x15
add v17.4s, v17.4s, v17.4s
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x10, x10, x15
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add v16.4s, v16.4s, v16.4s
add x12, x12, x15
add v17.4s, v17.4s, v17.4s
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x10, x10, x15
add v19.4s, v19.4s, v19.4s
add x13, x13, x15
add v16.4s, v16.4s, v16.4s
add x12, x12, x15
add v17.4s, v17.4s, v17.4s
add x11, x11, x15
add v18.4s, v18.4s, v18.4s
add x10, x10, x15
add v19.4s, v19.4s, v19.4s
sub x0, x0, x14
cbnz x0, mix1to1addvecadd128test_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixmulvecmultest(x0 = iteration count, x1 = test data pointer)
// 1:1 mix of scalar 32-bit integer multiplies and 128-bit vector integer
// multiplies (10 + 10 = 20 instructions per iteration; x14 = 20 matches).
// Fixes vs. original: removed a duplicated "stp x14, x15, [sp, #0x10]"
// (the same pair was stored twice to the same slot) and a stray
// "mov x7, x15" that clobbered argument register x7 and was never used.
_mixmulvecmultest:
mixmulvecmultest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 2
mov x14, 20
mov x13, x15
mov x12, x15
mov x11, x15
mov x10, x15
mov x9, x15
mov x8, x15
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
ldr q22, [x1]
ldr q23, [x1]
mixmulvecmultest_loop:
mul w8, w8, w15
mul v16.4s, v16.4s, v16.4s
mul w9, w9, w15
mul v17.4s, v17.4s, v17.4s
mul w10, w10, w15
mul v18.4s, v18.4s, v18.4s
mul w11, w11, w15
mul v19.4s, v19.4s, v19.4s
mul w12, w12, w15
mul v20.4s, v20.4s, v20.4s
mul w8, w8, w15
mul v16.4s, v16.4s, v16.4s
mul w9, w9, w15
mul v17.4s, v17.4s, v17.4s
mul w10, w10, w15
mul v18.4s, v18.4s, v18.4s
mul w11, w11, w15
mul v19.4s, v19.4s, v19.4s
mul w12, w12, w15
mul v20.4s, v20.4s, v20.4s
sub x0, x0, x14
cbnz x0, mixmulvecmultest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixvecmulfmultest(x0 = iteration count, x1 = FP data, x2 = int data)
// 1:1 mix of 128-bit FP multiplies and 128-bit integer multiplies
// (10 + 10 = 20 per iteration) to see whether fmul and mul share a pipe.
_mixvecmulfmultest:
mixvecmulfmultest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x2]
ldr q18, [x1]
ldr q19, [x2]
ldr q20, [x1]
ldr q21, [x2]
mixvecmulfmultest_loop:
fmul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
mul v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
mul v21.4s, v21.4s, v21.4s
fmul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
mul v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
mul v21.4s, v21.4s, v21.4s
fmul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
fmul v18.4s, v18.4s, v18.4s
mul v19.4s, v19.4s, v19.4s
fmul v20.4s, v20.4s, v20.4s
mul v21.4s, v21.4s, v21.4s
fmul v16.4s, v16.4s, v16.4s
mul v17.4s, v17.4s, v17.4s
sub x0, x0, x14
cbnz x0, mixvecmulfmultest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixvecaddfaddtest(x0 = iteration count, x1 = FP data, x2 = int data)
// 1:1 mix of 128-bit FP adds and 128-bit integer adds (10 + 10 = 20
// per iteration) to see whether fadd and add issue on separate pipes.
_mixvecaddfaddtest:
mixvecaddfaddtest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x2]
ldr q18, [x1]
ldr q19, [x2]
ldr q20, [x1]
ldr q21, [x2]
mixvecaddfaddtest_loop:
fadd v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
fadd v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
fadd v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
fadd v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
fadd v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
fadd v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
fadd v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
fadd v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
fadd v20.4s, v20.4s, v20.4s
add v21.4s, v21.4s, v21.4s
fadd v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
sub x0, x0, x14
cbnz x0, mixvecaddfaddtest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixjmpvecaddtest(x0 = iteration count, x1 = test data pointer)
// 2:1 mix of 128-bit vector adds and not-taken branches (20 adds +
// 10 cbz = 30 instructions per iteration; x14 = 30 matches).
// The cbz tests are never taken: x0 stays nonzero inside the loop,
// so the "jellydonut" exit only matters for branch-predictor pressure.
_mixjmpvecaddtest:
mixjmpvecaddtest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 30
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
mixjmpvecaddtest_loop:
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v20.4s, v20.4s, v20.4s
add v16.4s, v16.4s, v16.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v17.4s, v17.4s, v17.4s
add v18.4s, v18.4s, v18.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v19.4s, v19.4s, v19.4s
add v20.4s, v20.4s, v20.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v16.4s, v16.4s, v16.4s
add v17.4s, v17.4s, v17.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v18.4s, v18.4s, v18.4s
add v19.4s, v19.4s, v19.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v20.4s, v20.4s, v20.4s
add v16.4s, v16.4s, v16.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v17.4s, v17.4s, v17.4s
add v18.4s, v18.4s, v18.4s
cbz x0, mixjmpvecaddtest_jellydonut
add v19.4s, v19.4s, v19.4s
add v20.4s, v20.4s, v20.4s
cbz x0, mixjmpvecaddtest_jellydonut
sub x0, x0, x14
cbnz x0, mixjmpvecaddtest_loop
mixjmpvecaddtest_jellydonut:
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixjmpvecmultest(x0 = iteration count, x1 = test data pointer)
// 1:1 mix of 128-bit vector multiplies and not-taken branches
// (10 mul + 10 cbz = 20 per iteration). The cbz are never taken:
// x0 is nonzero throughout the loop body.
_mixjmpvecmultest:
mixjmpvecmultest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
mixjmpvecmultest_loop:
mul v16.4s, v16.4s, v16.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v17.4s, v17.4s, v17.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v18.4s, v18.4s, v18.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v19.4s, v19.4s, v19.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v20.4s, v20.4s, v20.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v16.4s, v16.4s, v16.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v17.4s, v17.4s, v17.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v18.4s, v18.4s, v18.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v19.4s, v19.4s, v19.4s
cbz x0, mixjmpvecmultest_jellydonut
mul v20.4s, v20.4s, v20.4s
cbz x0, mixjmpvecmultest_jellydonut
sub x0, x0, x14
cbnz x0, mixjmpvecmultest_loop
mixjmpvecmultest_jellydonut:
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// vecloadtest(x0 = iteration count, x1 = source pointer)
// 128-bit load throughput: 20 ldr q from the same address per iteration
// (all hit the same cache line, so only load-port width is measured).
_vecloadtest:
vecloadtest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
vecloadtest_loop:
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
sub x0, x0, x14
cbnz x0, vecloadtest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// vecstoretest(x0 = iteration count, x1 = source pointer, x2 = sink pointer)
// 128-bit store throughput: 20 str q to the same address per iteration.
_vecstoretest:
vecstoretest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
vecstoretest_loop:
str q16, [x2]
str q17, [x2]
str q18, [x2]
str q19, [x2]
str q20, [x2]
str q16, [x2]
str q17, [x2]
str q18, [x2]
str q19, [x2]
str q20, [x2]
str q16, [x2]
str q17, [x2]
str q18, [x2]
str q19, [x2]
str q20, [x2]
str q16, [x2]
str q17, [x2]
str q18, [x2]
str q19, [x2]
str q20, [x2]
sub x0, x0, x14
cbnz x0, vecstoretest_loop
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// loadtest(x0 = iteration count, x1 = source pointer)
// Scalar 64-bit load throughput: 20 ldr x from the same address per
// iteration. x14 is the destination register deliberately skipped in
// the loop body because it holds the decrement.
_loadtest:
loadtest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x14, 20
loadtest_loop:
ldr x10, [x1]
ldr x11, [x1]
ldr x12, [x1]
ldr x13, [x1]
ldr x15, [x1]
ldr x10, [x1]
ldr x11, [x1]
ldr x12, [x1]
ldr x13, [x1]
ldr x15, [x1]
ldr x10, [x1]
ldr x11, [x1]
ldr x12, [x1]
ldr x13, [x1]
ldr x15, [x1]
ldr x10, [x1]
ldr x11, [x1]
ldr x12, [x1]
ldr x13, [x1]
ldr x15, [x1]
sub x0, x0, x14
cbnz x0, loadtest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixloadstoretest(x0 = iteration count, x1 = load ptr, x2 = store ptr)
// 1:1 load/store mix: 10 scalar loads + 10 scalar stores per iteration.
// Load and store addresses differ, so no store-to-load forwarding.
_mixloadstoretest:
mixloadstoretest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x14, 20
mixloadstoretest_loop:
ldr x10, [x1]
str x14, [x2]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
str x14, [x2]
ldr x13, [x1]
str x14, [x2]
ldr x15, [x1]
str x14, [x2]
ldr x10, [x1]
str x14, [x2]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
str x14, [x2]
ldr x13, [x1]
str x14, [x2]
ldr x15, [x1]
str x14, [x2]
sub x0, x0, x14
cbnz x0, mixloadstoretest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mix21loadstoretest(x0 = iteration count, x1 = load ptr, x2 = store ptr)
// 2:1 load/store mix: 20 loads + 10 stores = 30 instructions per
// iteration (x14 = 30 matches), matching typical AGU port ratios.
_mix21loadstoretest:
mix21loadstoretest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x14, 30
mix21loadstoretest_loop:
ldr x10, [x1]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
ldr x13, [x1]
str x14, [x2]
ldr x10, [x1]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
ldr x13, [x1]
str x14, [x2]
ldr x10, [x1]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
ldr x13, [x1]
str x14, [x2]
ldr x10, [x1]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
ldr x13, [x1]
str x14, [x2]
ldr x10, [x1]
ldr x11, [x1]
str x14, [x2]
ldr x12, [x1]
ldr x13, [x1]
str x14, [x2]
sub x0, x0, x14
cbnz x0, mix21loadstoretest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// jmptest(x0 = iteration count)
// Not-taken branch throughput: 20 cbz per iteration, none ever taken
// (x0 is nonzero inside the loop). Measures branch issue rate.
_jmptest:
jmptest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
jmptest_loop:
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
cbz x0, jmptest_jellydonut
sub x0, x0, x14
cbnz x0, jmptest_loop
jmptest_jellydonut:
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// fusejmptest(x0 = iteration count)
// cmp + b.eq fusion test: nop-padded cmp/branch pairs, none taken
// (x0 is nonzero inside the loop body).
// Fix vs. original: the four b.eq targets read "jmptest_jellydonut",
// the exit label of a DIFFERENT function; they now target this
// function's own fusejmptest_jellydonut. (The branches are never taken,
// but jmptest's epilogue would have run had one fired, and the typo
// breaks the build if jmptest is assembled out or renamed.)
_fusejmptest:
fusejmptest:
sub sp, sp, #0x20
stp x14, x15, [sp, #0x10]
mov x14, 20
fusejmptest_loop:
nop
nop
cmp x0, 0
b.eq fusejmptest_jellydonut
nop
nop
cmp x0, 0
b.eq fusejmptest_jellydonut
nop
nop
cmp x0, 0
b.eq fusejmptest_jellydonut
nop
nop
cmp x0, 0
b.eq fusejmptest_jellydonut
nop
sub x0, x0, x14
cmp x0, 0
b.ne fusejmptest_loop
fusejmptest_jellydonut:
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x20
ret
// mixmuljmptest(x0 = iteration count)
// 1:1 mix of scalar multiplies and not-taken branches (10 mul +
// 10 cbz = 20 per iteration). cbz never fires: x0 is nonzero in-loop.
_mixmuljmptest:
mixmuljmptest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x8, 7
mov x9, 6
mov x10, 1
mov x11, 2
mov x12, 3
mov x13, 4
mov x15, 5
mov x14, 20
mixmuljmptest_loop:
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
mul x9, x9, x15
mul x8, x8, x15
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
cbz x0, mixmuljmptest_jellydonut
sub x0, x0, x14
cbnz x0, mixmuljmptest_loop
mixmuljmptest_jellydonut:
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixmuljmptest21(x0 = iteration count)
// 2:1 mix of scalar multiplies and not-taken branches (20 mul +
// 10 cbz = 30 per iteration; x14 = 30 matches).
_mixmuljmptest21:
mixmuljmptest21:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x8, 7
mov x9, 6
mov x10, 1
mov x11, 2
mov x12, 3
mov x13, 4
mov x15, 5
mov x14, 30
mixmuljmptest21_loop:
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
mul x9, x9, x15
mul x8, x8, x15
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
cbz x0, mixmuljmptest21_jellydonut
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
mul x9, x9, x15
mul x8, x8, x15
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
sub x0, x0, x14
cbnz x0, mixmuljmptest21_loop
mixmuljmptest21_jellydonut:
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixaddjmptest(x0 = iteration count)
// 1:1 mix of scalar adds and not-taken branches (10 add + 10 cbz = 20
// per iteration). cbz never fires: x0 is nonzero inside the loop.
// Fix vs. original: the last three cbz targeted mixmuljmptest_jellydonut
// and — critically — the loop-back branch was
// "cbnz x0, mixmuljmptest_loop", so after the first iteration control
// transferred into mixmuljmptest's loop and the test measured MULs
// instead of ADDs. All four now reference this function's own labels.
_mixaddjmptest:
mixaddjmptest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x8, 7
mov x9, 6
mov x10, 1
mov x11, 2
mov x12, 3
mov x13, 4
mov x15, 5
mov x14, 20
mixaddjmptest_loop:
add x10, x10, x15
add x11, x11, x15
add x12, x12, x15
add x13, x13, x15
add x9, x9, x15
add x8, x8, x15
add x10, x10, x15
add x11, x11, x15
add x12, x12, x15
add x13, x13, x15
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
cbz x0, mixaddjmptest_jellydonut
sub x0, x0, x14
cbnz x0, mixaddjmptest_loop
mixaddjmptest_jellydonut:
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixaddjmp21test(x0 = iteration count)
// 2:1 mix of scalar adds and not-taken branches (10 add + 5 cbz = 15
// per iteration; x14 = 15 matches). Loop-back uses cmp/b.gt so any
// over-shoot past zero still terminates.
_mixaddjmp21test:
mixaddjmp21test:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x8, 7
mov x9, 6
mov x10, 1
mov x11, 2
mov x12, 3
mov x13, 4
mov x15, 5
mov x14, 15
mixaddjmp21test_loop:
add x10, x10, x15
add x11, x11, x15
cbz x0, mixaddjmp21test_jellydonut
add x12, x12, x15
add x13, x13, x15
cbz x0, mixaddjmp21test_jellydonut
add x9, x9, x15
add x8, x8, x15
cbz x0, mixaddjmp21test_jellydonut
add x10, x10, x15
add x11, x11, x15
cbz x0, mixaddjmp21test_jellydonut
add x12, x12, x15
add x13, x13, x15
cbz x0, mixaddjmp21test_jellydonut
sub x0, x0, x14
cmp x0, 0
b.gt mixaddjmp21test_loop
mixaddjmp21test_jellydonut:
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// mixmulrortest(x0 = iteration count)
// 1:1 mix of scalar multiplies and rotates (10 mul + 10 ror = 20 per
// iteration) to check whether mul and shift/rotate share a pipe.
// Uses callee-saved x19-x24 for the ror chains, saved/restored here.
_mixmulrortest:
mixmulrortest:
sub sp, sp, #0x80
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
stp x19, x20, [sp, #0x50]
stp x21, x22, [sp, #0x60]
stp x23, x24, [sp, #0x70]
mov x8, 7
mov x9, 6
mov x10, 1
mov x11, 2
mov x12, 3
mov x13, 4
mov x15, 5
mov x19, x8
mov x20, x8
mov x21, x8
mov x22, x8
mov x23, x8
mov x24, x8
mov x14, 20
mixmulrortest_loop:
ror x24, x24, 1
ror x23, x23, 1
ror x22, x22, 1
ror x21, x21, 1
ror x20, x20, 1
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
mul x9, x9, x15
ror x24, x24, 1
ror x23, x23, 1
ror x22, x22, 1
ror x21, x21, 1
ror x20, x20, 1
mul x8, x8, x15
mul x10, x10, x15
mul x11, x11, x15
mul x12, x12, x15
mul x13, x13, x15
sub x0, x0, x14
cbnz x0, mixmulrortest_loop
ldp x23, x24, [sp, #0x70]
ldp x21, x22, [sp, #0x60]
ldp x19, x20, [sp, #0x50]
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x80
ret
// rortest(x0 = iteration count)
// Rotate throughput: 20 ror per iteration over a 6-register rotation.
// rortest_jellydonut is an unused label (no branch targets it).
_rortest:
rortest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x8, 7
mov x9, 6
mov x10, 1
mov x11, 2
mov x12, 3
mov x13, 4
mov x15, 5
mov x14, 20
rortest_loop:
ror x10, x10, 1
ror x11, x11, 1
ror x12, x12, 1
ror x13, x13, 1
ror x9, x9, 1
ror x8, x8, 1
ror x10, x10, 1
ror x11, x11, 1
ror x12, x12, 1
ror x13, x13, 1
ror x10, x10, 1
ror x11, x11, 1
ror x12, x12, 1
ror x13, x13, 1
ror x9, x9, 1
ror x8, x8, 1
ror x10, x10, 1
ror x11, x11, 1
ror x12, x12, 1
ror x13, x13, 1
sub x0, x0, x14
cbnz x0, rortest_loop
rortest_jellydonut:
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// depmovtest(x0 = iteration count)
// Dependent register-to-register mov chain: each mov reads the result
// of the previous one. If the core eliminates movs at rename, this
// still runs at 1/cycle or better; otherwise it exposes mov latency.
_depmovtest:
depmovtest:
sub sp, sp, #0x40
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
mov x15, 1
mov x14, 20
eor x13, x13, x13
depmovtest_loop:
mov x12, x15
mov x10, x12
mov x13, x10
mov x11, x13
mov x15, x11
mov x12, x15
mov x10, x12
mov x13, x10
mov x11, x13
mov x15, x11
mov x12, x15
mov x10, x12
mov x13, x10
mov x11, x13
mov x15, x11
mov x12, x15
mov x10, x12
mov x13, x10
mov x11, x13
mov x15, x11
sub x0, x0, x14
cbnz x0, depmovtest_loop
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x40
ret
// indepmovtest(x0 = iteration count)
// Independent register-to-register movs: 20 movs per iteration with no
// chain between them, measuring mov throughput / move elimination width.
_indepmovtest:
indepmovtest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 20
eor x13, x13, x13
indepmovtest_loop:
mov x10, x15
mov x11, x14
mov x12, x13
mov x9, x15
mov x8, x14
mov x10, x15
mov x11, x14
mov x12, x13
mov x9, x15
mov x8, x14
mov x10, x15
mov x11, x14
mov x12, x13
mov x9, x15
mov x8, x14
mov x10, x15
mov x11, x14
mov x12, x13
mov x9, x15
mov x8, x14
sub x0, x0, x14
cbnz x0, indepmovtest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// xorzerotest(x0 = iteration count)
// Zeroing-idiom test: 20x "eor x15, x15, x15" per iteration. If the
// core recognizes self-xor as a dependency-breaking zeroing idiom these
// execute independently; otherwise they form a serial chain.
_xorzerotest:
xorzerotest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 20
xorzerotest_loop:
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
eor x15, x15, x15
sub x0, x0, x14
cbnz x0, xorzerotest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// movzerotest(x0 = iteration count)
// Immediate-zero mov throughput: 20x "mov x15, 0" per iteration; all
// writes are independent, measuring rename/issue width for mov-imm.
_movzerotest:
movzerotest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 20
movzerotest_loop:
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
mov x15, 0
sub x0, x0, x14
cbnz x0, movzerotest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// subzerotest(x0 = iteration count)
// Zeroing-idiom test via self-subtract: 20x "sub x15, x15, x15" per
// iteration; checks whether sub-same-reg is dependency-breaking.
_subzerotest:
subzerotest:
sub sp, sp, #0x50
stp x14, x15, [sp, #0x10]
stp x12, x13, [sp, #0x20]
stp x10, x11, [sp, #0x30]
stp x8, x9, [sp, #0x40]
mov x15, 1
mov x14, 20
subzerotest_loop:
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x15, x15, x15
sub x0, x0, x14
cbnz x0, subzerotest_loop
ldp x8, x9, [sp, #0x40]
ldp x10, x11, [sp, #0x30]
ldp x12, x13, [sp, #0x20]
ldp x14, x15, [sp, #0x10]
add sp, sp, #0x50
ret
// aesetest(x0 = iteration count, x1 = test data pointer)
// AES encrypt-round (aese) throughput: 20 aese per iteration over five
// independent v0-v4 chains. v0-v4 start uninitialized — values are
// don't-care, only the rate is measured. x14 is caller-saved under
// AAPCS64, so clobbering it without a save is fine.
_aesetest:
aesetest:
sub sp, sp, #0x50
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
mov x14, 20
aesetest_loop:
aese v0.16b, v16.16b
aese v1.16b, v17.16b
aese v2.16b, v18.16b
aese v3.16b, v19.16b
aese v4.16b, v20.16b
aese v0.16b, v16.16b
aese v1.16b, v17.16b
aese v2.16b, v18.16b
aese v3.16b, v19.16b
aese v4.16b, v20.16b
aese v0.16b, v16.16b
aese v1.16b, v17.16b
aese v2.16b, v18.16b
aese v3.16b, v19.16b
aese v4.16b, v20.16b
aese v0.16b, v16.16b
aese v1.16b, v17.16b
aese v2.16b, v18.16b
aese v3.16b, v19.16b
aese v4.16b, v20.16b
sub x0, x0, x14
cbnz x0, aesetest_loop
add sp, sp, #0x50
ret
// mixaesevecadd128test(x0 = iteration count, x1 = test data pointer)
// 1:1 mix of aese and 128-bit vector adds (10 + 10 = 20 per iteration)
// to see whether the crypto unit issues alongside plain SIMD adds.
// v0-v13 destinations/sources start uninitialized; values are
// don't-care, only issue rate matters.
_mixaesevecadd128test:
mixaesevecadd128test:
sub sp, sp, #0x50
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
mov x14, 20
mixaesevecadd128test_loop:
aese v0.16b, v16.16b
add v5.4s, v9.4s, v16.4s
aese v1.16b, v17.16b
add v6.4s, v10.4s, v16.4s
aese v2.16b, v18.16b
add v7.4s, v11.4s, v16.4s
aese v3.16b, v19.16b
add v31.4s, v12.4s, v16.4s
aese v4.16b, v20.16b
add v30.4s, v13.4s, v16.4s
aese v0.16b, v16.16b
add v5.4s, v9.4s, v16.4s
aese v1.16b, v17.16b
add v6.4s, v10.4s, v16.4s
aese v2.16b, v18.16b
add v7.4s, v11.4s, v16.4s
aese v3.16b, v19.16b
add v31.4s, v12.4s, v16.4s
aese v4.16b, v20.16b
add v30.4s, v13.4s, v16.4s
sub x0, x0, x14
cbnz x0, mixaesevecadd128test_loop
add sp, sp, #0x50
ret
// pmulltest(x0 = iteration count, x1 = test data pointer)
// Polynomial multiply (pmull, 64x64->128) throughput: 20 independent
// pmull per iteration writing v0-v4 from the same v16/v17 sources.
_pmulltest:
pmulltest:
sub sp, sp, #0x50
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
mov x14, 20
pmulltest_loop:
pmull v0.1q, v16.1d, v17.1d
pmull v1.1q, v16.1d, v17.1d
pmull v2.1q, v16.1d, v17.1d
pmull v3.1q, v16.1d, v17.1d
pmull v4.1q, v16.1d, v17.1d
pmull v0.1q, v16.1d, v17.1d
pmull v1.1q, v16.1d, v17.1d
pmull v2.1q, v16.1d, v17.1d
pmull v3.1q, v16.1d, v17.1d
pmull v4.1q, v16.1d, v17.1d
pmull v0.1q, v16.1d, v17.1d
pmull v1.1q, v16.1d, v17.1d
pmull v2.1q, v16.1d, v17.1d
pmull v3.1q, v16.1d, v17.1d
pmull v4.1q, v16.1d, v17.1d
pmull v0.1q, v16.1d, v17.1d
pmull v1.1q, v16.1d, v17.1d
pmull v2.1q, v16.1d, v17.1d
pmull v3.1q, v16.1d, v17.1d
pmull v4.1q, v16.1d, v17.1d
sub x0, x0, x14
cbnz x0, pmulltest_loop
add sp, sp, #0x50
ret
// mixpmulladd128test(x0 = iteration count, x1 = test data pointer)
// 1:1 mix of pmull and 128-bit vector adds (10 + 10 = 20 per iteration)
// to see whether polynomial multiply shares a pipe with SIMD add.
// v9 starts uninitialized; values are don't-care for a rate test.
_mixpmulladd128test:
mixpmulladd128test:
sub sp, sp, #0x50
ldr q16, [x1]
ldr q17, [x1]
ldr q18, [x1]
ldr q19, [x1]
ldr q20, [x1]
ldr q21, [x1]
mov x14, 20
mixpmulladd128test_loop:
pmull v0.1q, v16.1d, v17.1d
add v5.4s, v9.4s, v16.4s
pmull v1.1q, v16.1d, v17.1d
add v6.4s, v9.4s, v16.4s
pmull v2.1q, v16.1d, v17.1d
add v7.4s, v9.4s, v16.4s
pmull v3.1q, v16.1d, v17.1d
add v31.4s, v9.4s, v16.4s
pmull v4.1q, v16.1d, v17.1d
add v30.4s, v9.4s, v16.4s
pmull v0.1q, v16.1d, v17.1d
add v5.4s, v9.4s, v16.4s
pmull v1.1q, v16.1d, v17.1d
add v6.4s, v9.4s, v16.4s
pmull v2.1q, v16.1d, v17.1d
add v7.4s, v9.4s, v16.4s
pmull v3.1q, v16.1d, v17.1d
add v31.4s, v9.4s, v16.4s
pmull v4.1q, v16.1d, v17.1d
add v30.4s, v9.4s, v16.4s
sub x0, x0, x14
cbnz x0, mixpmulladd128test_loop
add sp, sp, #0x50
ret
================================================
FILE: InstructionRate/riscv_instructionrate.c
================================================
#define _GNU_SOURCE
/* NOTE(review): the original include list was garbled in extraction (header
   names missing). Reconstructed from the identifiers used in this file:
   printf/fprintf (stdio), uint64_t (stdint), atoi (stdlib), strncmp (string),
   gettimeofday/timeval/timezone (sys/time). Verify the last three against the
   repository's other instructionrate drivers. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>
#include <math.h>
#include <sys/time.h>
#include <unistd.h>
float measureFunction(uint64_t iterations, float clockSpeedGhz, void *arr, uint64_t (*testfunc)(uint64_t, void *));
extern uint64_t clktest(uint64_t iterations, void *data);
extern uint64_t addtest(uint64_t iterations, void *data);
extern uint64_t faddtest(uint64_t iterations, void *data);
extern uint64_t fmultest(uint64_t iterations, void *data);
extern uint64_t mixfaddfmultest(uint64_t iterations, void *data);
extern uint64_t fmatest(uint64_t iterations, void *data);
extern uint64_t faddlattest(uint64_t iterations, void *data);
extern uint64_t fmullattest(uint64_t iterations, void *data);
extern uint64_t fmalattest(uint64_t iterations, void *data);
float fpTestArr[4] __attribute__ ((aligned (64))) = { 0.2, 1.5, 2.7, 3.14 };
int intTestArr[4] __attribute__ ((aligned (64))) = { 1, 2, 3, 4 };
int sinkArr[4] __attribute__ ((aligned (64))) = { 2, 3, 4, 5 };
/*
 * Benchmark driver: estimates core clock speed from a serially dependent add
 * loop, then reports per-clock throughput for integer adds and scalar FP
 * add/mul/FMA, plus latency for the FP ops (latency tests run a dependent
 * chain, so 1/throughput = cycles of latency).
 *
 * Optional argument: -iter <n> multiplies the built-in iteration counts by n.
 * Returns 0 on completion.
 */
int main(int argc, char *argv[]) {
    struct timeval startTv, endTv;
    struct timezone startTz, endTz;
    uint64_t iterations = 1500000000;
    uint64_t iterationsHigh = iterations * 5;
    uint64_t time_diff_ms;
    float latency, clockSpeedGhz; /* dropped unused opsPerNs local */

    /* (redundant argc > 1 wrapper removed; the loop body never runs then) */
    for (int argIdx = 1; argIdx < argc; argIdx++) {
        if (*(argv[argIdx]) == '-') {
            char *arg = argv[argIdx] + 1;
            if (strncmp(arg, "iter", 4) == 0) {
                /* Bug fix: the original unconditionally consumed the next
                 * argument, so a trailing "-iter" passed argv[argc] (NULL)
                 * to atoi and crashed. */
                if (argIdx + 1 >= argc) {
                    fprintf(stderr, "-iter requires an integer argument\n");
                    break;
                }
                argIdx++;
                int iterMul = atoi(argv[argIdx]);
                if (iterMul <= 0) {
                    /* atoi returns 0 on garbage; scaling by <= 0 would zero
                     * or corrupt the iteration counts */
                    fprintf(stderr, "Ignoring non-positive iteration multiplier\n");
                    continue;
                }
                iterations *= iterMul;
                iterationsHigh *= iterMul;
                fprintf(stderr, "Scaled iterations by %d\n", iterMul);
            }
        }
    }

    /* Time a chain of dependent adds; assuming ~1 add/cycle, 1/latency(ns)
     * approximates the clock in GHz. */
    gettimeofday(&startTv, &startTz);
    clktest(iterations, NULL);
    gettimeofday(&endTv, &endTz);
    time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
    latency = 1e6 * (float)time_diff_ms / (float)iterations;
    // clk speed should be 1/latency, assuming we got one add per clk, roughly
    clockSpeedGhz = 1 / latency;
    printf("Estimated clock speed> %.2f GHz\n", clockSpeedGhz);

    // integer side
    printf("Adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, NULL, addtest));

    // FP
    printf("FP32 Adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, faddtest));
    printf("FP32 Add latency> %.2f cycles\n", 1 / measureFunction(iterations, clockSpeedGhz, fpTestArr, faddlattest));
    printf("FP32 Multiplies per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, fmultest));
    printf("FP32 Multiply latency> %.2f cycles\n", 1 / measureFunction(iterations, clockSpeedGhz, fpTestArr, fmullattest));
    printf("1:1 FP32 Add:Mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, mixfaddfmultest));
    printf("FP32 FMA per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, fmatest));
    printf("FP32 FMA latency> %.2f cycles\n", 1 / measureFunction(iterations, clockSpeedGhz, fpTestArr, fmalattest));
    return 0;
}
/*
 * Runs testfunc(iterations, arr) once, times it with gettimeofday, and
 * returns operations per clock (measured ops/ns divided by clockSpeedGhz).
 *
 * iterations    - count handed to the assembly kernel (it decrements by its
 *                 unroll factor, so this approximates total ops executed)
 * clockSpeedGhz - previously estimated core clock, used for normalization
 * arr           - opaque data pointer passed through to the kernel (may be NULL)
 * testfunc      - kernel under test; its return value is only for debugging
 *
 * Robustness fix: a run completing in under 1 ms made time_diff_ms zero, so
 * the division below produced latency == 0 and an inf result; now warns and
 * returns 0 instead.
 */
float measureFunction(uint64_t iterations, float clockSpeedGhz, void *arr, uint64_t (*testfunc)(uint64_t, void *)) {
    struct timeval startTv, endTv;
    struct timezone startTz, endTz;
    uint64_t time_diff_ms, retval;
    float latency, opsPerNs;

    gettimeofday(&startTv, &startTz);
    retval = testfunc(iterations, arr);
    gettimeofday(&endTv, &endTz);
    time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);

    if (time_diff_ms == 0) {
        fprintf(stderr, "Test finished in under 1 ms; scale up iterations with -iter\n");
        return 0.0f;
    }

    (void)retval; /* silence unused-variable warning; see debug printf below */
    //printf("return value: %lu\n", retval);
    latency = 1e6 * (float)time_diff_ms / (float)iterations;
    opsPerNs = 1 / latency;
    return opsPerNs / clockSpeedGhz;
}
================================================
FILE: InstructionRate/riscv_instructionrate.s
================================================
.text
.global clktest
.global addtest
.global faddtest
.global fmultest
.global mixfaddfmultest
.global fmatest
.global faddlattest
.global fmullattest
.global fmalattest
/* a0 = iterations, a1 = data arr */
clktest:
/* Clock-speed estimation kernel. t0 accumulates t1 (= 1) twenty times per
   iteration; every add reads the previous add's result, so the chain is
   fully serial. NOTE(review): assumes the core retires ~1 dependent add per
   cycle — the C driver derives clock speed from the elapsed time on that
   assumption. */
mv t0, x0
mv t1, x0
addi t1, t1, 1
clktest_loop:
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
add t0, t0, t1
/* 20 adds per iteration; loop while a0 > 0 */
addi a0, a0, -20
blt x0, a0, clktest_loop
ret
addtest:
/* Integer add throughput test: five independent accumulators (t1-t5) each
   add the shared constant t6 (= 1), 20 adds per iteration, so consecutive
   instructions have no data dependencies and can issue in parallel. */
mv t0, x0
addi t0, t0, 1
mv t1, t0
mv t2, t0
mv t3, t0
mv t4, t0
mv t5, t0
mv t6, t0
addtest_loop:
add t1, t1, t6
add t2, t2, t6
add t3, t3, t6
add t4, t4, t6
add t5, t5, t6
add t1, t1, t6
add t2, t2, t6
add t3, t3, t6
add t4, t4, t6
add t5, t5, t6
add t1, t1, t6
add t2, t2, t6
add t3, t3, t6
add t4, t4, t6
add t5, t5, t6
add t1, t1, t6
add t2, t2, t6
add t3, t3, t6
add t4, t4, t6
add t5, t5, t6
/* 20 adds per iteration; loop while a0 > 0 */
addi a0, a0, -20
blt x0, a0, addtest_loop
ret
/* f0-7 are fp temporaries */
faddtest:
/* FP32 add throughput: f1-f7 accumulate independently, 14 adds/iteration.
   Bug fix: the test loads single-precision data with flw (which NaN-boxes
   the upper 32 bits) and the C driver reports it as "FP32", but the original
   used fadd.d/fsub.d — the .d ops see the NaN-boxed singles as FP64 NaNs, so
   it was really measuring double-precision throughput. Switched to .s. */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7 (x - x), then seed them from f0 */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
faddtest_loop:
fadd.s f1, f1, f0
fadd.s f2, f2, f0
fadd.s f3, f3, f0
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
fadd.s f1, f1, f0
fadd.s f2, f2, f0
fadd.s f3, f3, f0
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
/* 14 adds per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, faddtest_loop
ret
faddlattest:
/* FP32 add latency: f1 = f1 + f1 repeated 14 times per iteration forms a
   fully serial chain, so measured throughput = 1 / add latency.
   Bug fix: switched fadd.d/fsub.d to the .s forms — the data is loaded with
   flw (NaN-boxed single precision) and the C driver labels this "FP32", so
   the original was actually timing FP64 ops on NaNs. */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7, then seed from f0 (kept for symmetry with faddtest) */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
faddlattest_loop:
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
fadd.s f1, f1, f1
/* 14 dependent adds per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, faddlattest_loop
ret
fmultest:
/* FP32 multiply throughput: f1-f7 are independent, 14 muls/iteration.
   Bug fix: switched fmul.d/fsub.d/fadd.d to the .s forms to match the flw
   loads and the "FP32" label in the C driver; the .d ops treated the
   NaN-boxed singles as FP64 NaNs and measured double precision. */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7, then seed them from f0 */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
fmultest_loop:
fmul.s f1, f1, f0
fmul.s f2, f2, f0
fmul.s f3, f3, f0
fmul.s f4, f4, f0
fmul.s f5, f5, f0
fmul.s f6, f6, f0
fmul.s f7, f7, f0
fmul.s f1, f1, f0
fmul.s f2, f2, f0
fmul.s f3, f3, f0
fmul.s f4, f4, f0
fmul.s f5, f5, f0
fmul.s f6, f6, f0
fmul.s f7, f7, f0
/* 14 multiplies per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, fmultest_loop
ret
fmullattest:
/* FP32 multiply latency: serial chain f1 = f1 * f1, 14 per iteration, so
   measured throughput = 1 / multiply latency.
   Bug fix: .d forms replaced with .s to match the flw loads and the "FP32"
   label in the C driver (the originals operated on NaN-boxed singles as
   FP64 NaNs). */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7, then seed from f0 (kept for symmetry with fmultest) */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
fmullattest_loop:
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
fmul.s f1, f1, f1
/* 14 dependent multiplies per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, fmullattest_loop
ret
mixfaddfmultest:
/* 1:1 FP32 add:multiply throughput mix, 14 ops/iteration across f1-f7 with
   the add/mul role alternating between the two unrolled halves.
   Bug fix: .d forms replaced with .s to match the flw loads and the
   "1:1 FP32 Add:Mul" label in the C driver (the originals operated on
   NaN-boxed singles as FP64 NaNs). */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7, then seed them from f0 */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
mixfaddfmultest_loop:
fadd.s f1, f1, f0
fmul.s f2, f2, f0
fadd.s f3, f3, f0
fmul.s f4, f4, f0
fadd.s f5, f5, f0
fmul.s f6, f6, f0
fadd.s f7, f7, f0
fmul.s f1, f1, f0
fadd.s f2, f2, f0
fmul.s f3, f3, f0
fadd.s f4, f4, f0
fmul.s f5, f5, f0
fadd.s f6, f6, f0
fmul.s f7, f7, f0
/* 14 ops per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, mixfaddfmultest_loop
ret
fmatest:
/* FP32 FMA throughput: f1-f7 computed independently as fX = fX*fX + f0,
   14 FMAs per iteration.
   Bug fix: fmadd.d (and the .d setup ops) replaced with .s to match the flw
   loads and the "FP32 FMA" label in the C driver (the originals operated on
   NaN-boxed singles as FP64 NaNs). */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7, then seed them from f0 */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
fmatest_loop:
fmadd.s f1, f1, f1, f0
fmadd.s f2, f2, f2, f0
fmadd.s f3, f3, f3, f0
fmadd.s f4, f4, f4, f0
fmadd.s f5, f5, f5, f0
fmadd.s f6, f6, f6, f0
fmadd.s f7, f7, f7, f0
fmadd.s f1, f1, f1, f0
fmadd.s f2, f2, f2, f0
fmadd.s f3, f3, f3, f0
fmadd.s f4, f4, f4, f0
fmadd.s f5, f5, f5, f0
fmadd.s f6, f6, f6, f0
fmadd.s f7, f7, f7, f0
/* 14 FMAs per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, fmatest_loop
ret
fmalattest:
/* FP32 FMA latency: serial chain f1 = f1*f1 + f0 repeated 14 times per
   iteration, so measured throughput = 1 / FMA latency.
   Bug fix: .d forms replaced with .s to match the flw loads and the
   "FP32 FMA latency" label in the C driver (the originals operated on
   NaN-boxed singles as FP64 NaNs). */
flw f0, (a1)
flw f1, 4(a1)
flw f2, 8(a1)
flw f3, 12(a1)
/* zero f4-f7, then seed from f0 (kept for symmetry with fmatest) */
fsub.s f4, f4, f4
fsub.s f5, f5, f5
fsub.s f6, f6, f6
fsub.s f7, f7, f7
fadd.s f4, f4, f0
fadd.s f5, f5, f0
fadd.s f6, f6, f0
fadd.s f7, f7, f0
fmalattest_loop:
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
fmadd.s f1, f1, f1, f0
/* 14 dependent FMAs per iteration; loop while a0 > 0 */
addi a0, a0, -14
blt x0, a0, fmalattest_loop
ret
================================================
FILE: InstructionRate/test.s
================================================
x86_instructionrate: file format elf64-x86-64
Disassembly of section .init:
0000000000001000 <_init>:
1000: f3 0f 1e fa endbr64
1004: 48 83 ec 08 sub $0x8,%rsp
1008: 48 8b 05 d9 cf 00 00 mov 0xcfd9(%rip),%rax # dfe8 <__gmon_start__>
100f: 48 85 c0 test %rax,%rax
1012: 74 02 je 1016 <_init+0x16>
1014: ff d0 callq *%rax
1016: 48 83 c4 08 add $0x8,%rsp
101a: c3 retq
Disassembly of section .plt:
0000000000001020 <.plt>:
1020: ff 35 62 cf 00 00 pushq 0xcf62(%rip) # df88 <_GLOBAL_OFFSET_TABLE_+0x8>
1026: ff 25 64 cf 00 00 jmpq *0xcf64(%rip) # df90 <_GLOBAL_OFFSET_TABLE_+0x10>
102c: 0f 1f 40 00 nopl 0x0(%rax)
0000000000001030 :
1030: ff 25 62 cf 00 00 jmpq *0xcf62(%rip) # df98
1036: 68 00 00 00 00 pushq $0x0
103b: e9 e0 ff ff ff jmpq 1020 <.plt>
0000000000001040 <__stack_chk_fail@plt>:
1040: ff 25 5a cf 00 00 jmpq *0xcf5a(%rip) # dfa0 <__stack_chk_fail@GLIBC_2.4>
1046: 68 01 00 00 00 pushq $0x1
104b: e9 d0 ff ff ff jmpq 1020 <.plt>
0000000000001050 :
1050: ff 25 52 cf 00 00 jmpq *0xcf52(%rip) # dfa8
1056: 68 02 00 00 00 pushq $0x2
105b: e9 c0 ff ff ff jmpq 1020 <.plt>
0000000000001060 :
1060: ff 25 4a cf 00 00 jmpq *0xcf4a(%rip) # dfb0
1066: 68 03 00 00 00 pushq $0x3
106b: e9 b0 ff ff ff jmpq 1020 <.plt>
0000000000001070 :
1070: ff 25 42 cf 00 00 jmpq *0xcf42(%rip) # dfb8
1076: 68 04 00 00 00 pushq $0x4
107b: e9 a0 ff ff ff jmpq 1020 <.plt>
0000000000001080 <__printf_chk@plt>:
1080: ff 25 3a cf 00 00 jmpq *0xcf3a(%rip) # dfc0 <__printf_chk@GLIBC_2.3.4>
1086: 68 05 00 00 00 pushq $0x5
108b: e9 90 ff ff ff jmpq 1020 <.plt>
0000000000001090 :
1090: ff 25 32 cf 00 00 jmpq *0xcf32(%rip) # dfc8
1096: 68 06 00 00 00 pushq $0x6
109b: e9 80 ff ff ff jmpq 1020 <.plt>
00000000000010a0 :
10a0: ff 25 2a cf 00 00 jmpq *0xcf2a(%rip) # dfd0
10a6: 68 07 00 00 00 pushq $0x7
10ab: e9 70 ff ff ff jmpq 1020 <.plt>
Disassembly of section .plt.got:
00000000000010b0 <__cxa_finalize@plt>:
10b0: ff 25 42 cf 00 00 jmpq *0xcf42(%rip) # dff8 <__cxa_finalize@GLIBC_2.2.5>
10b6: 66 90 xchg %ax,%ax
Disassembly of section .text:
00000000000010c0 :
10c0: f3 0f 1e fa endbr64
10c4: 41 57 push %r15
10c6: 41 56 push %r14
10c8: 41 55 push %r13
10ca: 41 54 push %r12
10cc: 41 89 fc mov %edi,%r12d
10cf: bf 40 00 00 00 mov $0x40,%edi
10d4: 55 push %rbp
10d5: 48 89 f5 mov %rsi,%rbp
10d8: be 00 10 00 00 mov $0x1000,%esi
10dd: 53 push %rbx
10de: 48 83 ec 58 sub $0x58,%rsp
10e2: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax
10e9: 00 00
10eb: 48 89 44 24 48 mov %rax,0x48(%rsp)
10f0: 31 c0 xor %eax,%eax
10f2: e8 a9 ff ff ff callq 10a0
10f7: 66 0f 6f 0d 91 af 00 movdqa 0xaf91(%rip),%xmm1 # c090 <_IO_stdin_used+0x1090>
10fe: 00
10ff: 66 0f 6f 25 99 af 00 movdqa 0xaf99(%rip),%xmm4 # c0a0 <_IO_stdin_used+0x10a0>
1106: 00
1107: 48 89 05 f2 cf 00 00 mov %rax,0xcff2(%rip) # e100
110e: 66 0f 6f 1d 9a af 00 movdqa 0xaf9a(%rip),%xmm3 # c0b0 <_IO_stdin_used+0x10b0>
1115: 00
1116: 48 8d 90 00 10 00 00 lea 0x1000(%rax),%rdx
111d: 0f 1f 00 nopl (%rax)
1120: 66 0f 6f c1 movdqa %xmm1,%xmm0
1124: 48 83 c0 10 add $0x10,%rax
1128: 66 0f d4 cc paddq %xmm4,%xmm1
112c: 66 0f 6f d0 movdqa %xmm0,%xmm2
1130: 66 0f d4 d3 paddq %xmm3,%xmm2
1134: 0f c6 c2 88 shufps $0x88,%xmm2,%xmm0
1138: 0f 29 40 f0 movaps %xmm0,-0x10(%rax)
113c: 48 39 c2 cmp %rax,%rdx
113f: 75 df jne 1120
1141: 49 be 00 eb 08 bf 01 movabs $0x1bf08eb00,%r14
1148: 00 00 00
114b: 41 83 fc 02 cmp $0x2,%r12d
114f: 0f 8f db 35 00 00 jg 4730
1155: 4c 8d 2d 94 cf 00 00 lea 0xcf94(%rip),%r13 # e0f0 <__cpu_model>
115c: 41 f6 45 0d 02 testb $0x2,0xd(%r13)
1161: 0f 85 a7 35 00 00 jne 470e
1167: 41 f6 45 0d 04 testb $0x4,0xd(%r13)
116c: 0f 85 7a 35 00 00 jne 46ec
1172: 41 f6 45 0e 02 testb $0x2,0xe(%r13)
1177: 0f 85 4d 35 00 00 jne 46ca
117d: b8 07 00 00 00 mov $0x7,%eax
1182: 31 c9 xor %ecx,%ecx
1184: 0f a2 cpuid
1186: 81 e3 00 00 01 00 and $0x10000,%ebx
118c: 0f 85 af 1f 00 00 jne 3141
1192: 41 83 fc 01 cmp $0x1,%r12d
1196: 0f 84 9c 47 00 00 je 5938
119c: f2 0f 10 05 dc ae 00 movsd 0xaedc(%rip),%xmm0 # c080 <_IO_stdin_used+0x1080>
11a3: 00
11a4: bf 01 00 00 00 mov $0x1,%edi
11a9: b8 01 00 00 00 mov $0x1,%eax
11ae: 48 8d 35 c3 a5 00 00 lea 0xa5c3(%rip),%rsi # b778 <_IO_stdin_used+0x778>
11b5: e8 c6 fe ff ff callq 1080 <__printf_chk@plt>
11ba: f3 0f 10 35 c6 ae 00 movss 0xaec6(%rip),%xmm6 # c088 <_IO_stdin_used+0x1088>
11c1: 00
11c2: f3 0f 11 74 24 0c movss %xmm6,0xc(%rsp)
11c8: f3 0f 11 74 24 08 movss %xmm6,0x8(%rsp)
11ce: 41 83 fc 01 cmp $0x1,%r12d
11d2: 0f 8e 6e 17 00 00 jle 2946
11d8: 4c 8b 6d 08 mov 0x8(%rbp),%r13
11dc: ba 05 00 00 00 mov $0x5,%edx
11e1: 48 8d 35 86 a5 00 00 lea 0xa586(%rip),%rsi # b76e <_IO_stdin_used+0x76e>
11e8: 4c 89 ef mov %r13,%rdi
11eb: e8 40 fe ff ff callq 1030
11f0: 85 c0 test %eax,%eax
11f2: 0f 85 a3 17 00 00 jne 299b
11f8: 48 8d 35 79 5b 00 00 lea 0x5b79(%rip),%rsi # 6d78
11ff: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1205: 4c 89 f7 mov %r14,%rdi
1208: e8 33 98 00 00 callq aa40
120d: bf 01 00 00 00 mov $0x1,%edi
1212: b8 01 00 00 00 mov $0x1,%eax
1217: 48 8d 35 2e a5 00 00 lea 0xa52e(%rip),%rsi # b74c <_IO_stdin_used+0x74c>
121e: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
1222: e8 59 fe ff ff callq 1080 <__printf_chk@plt>
1227: 4c 8b 6d 08 mov 0x8(%rbp),%r13
122b: ba 05 00 00 00 mov $0x5,%edx
1230: 48 8d 35 8e 9f 00 00 lea 0x9f8e(%rip),%rsi # b1c5 <_IO_stdin_used+0x1c5>
1237: 4c 89 ef mov %r13,%rdi
123a: e8 f1 fd ff ff callq 1030
123f: 85 c0 test %eax,%eax
1241: 0f 85 70 17 00 00 jne 29b7
1247: 48 8d 35 f1 5a 00 00 lea 0x5af1(%rip),%rsi # 6d3f
124e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1254: 4c 89 f7 mov %r14,%rdi
1257: e8 e4 97 00 00 callq aa40
125c: bf 01 00 00 00 mov $0x1,%edi
1261: b8 01 00 00 00 mov $0x1,%eax
1266: 48 8d 35 c4 a4 00 00 lea 0xa4c4(%rip),%rsi # b731 <_IO_stdin_used+0x731>
126d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
1271: e8 0a fe ff ff callq 1080 <__printf_chk@plt>
1276: 4c 8b 6d 08 mov 0x8(%rbp),%r13
127a: ba 03 00 00 00 mov $0x3,%edx
127f: 48 8d 35 89 9f 00 00 lea 0x9f89(%rip),%rsi # b20f <_IO_stdin_used+0x20f>
1286: 4c 89 ef mov %r13,%rdi
1289: e8 a2 fd ff ff callq 1030
128e: 85 c0 test %eax,%eax
1290: 0f 85 3d 17 00 00 jne 29d3
1296: 48 8d 35 01 5b 00 00 lea 0x5b01(%rip),%rsi # 6d9e
129d: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
12a3: 4c 89 f7 mov %r14,%rdi
12a6: e8 95 97 00 00 callq aa40
12ab: bf 01 00 00 00 mov $0x1,%edi
12b0: b8 01 00 00 00 mov $0x1,%eax
12b5: 48 8d 35 61 a4 00 00 lea 0xa461(%rip),%rsi # b71d <_IO_stdin_used+0x71d>
12bc: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
12c0: e8 bb fd ff ff callq 1080 <__printf_chk@plt>
12c5: 4c 8b 6d 08 mov 0x8(%rbp),%r13
12c9: 48 8d 35 fb 9e 00 00 lea 0x9efb(%rip),%rsi # b1cb <_IO_stdin_used+0x1cb>
12d0: 4c 89 ef mov %r13,%rdi
12d3: e8 88 fd ff ff callq 1060
12d8: 85 c0 test %eax,%eax
12da: 0f 85 0a 17 00 00 jne 29ea
12e0: 4c 8d 3d 43 5b 00 00 lea 0x5b43(%rip),%r15 # 6e2a
12e7: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
12ed: 4c 89 f7 mov %r14,%rdi
12f0: 4c 89 fe mov %r15,%rsi
12f3: e8 48 97 00 00 callq aa40
12f8: bf 01 00 00 00 mov $0x1,%edi
12fd: b8 01 00 00 00 mov $0x1,%eax
1302: 48 8d 35 f7 a3 00 00 lea 0xa3f7(%rip),%rsi # b700 <_IO_stdin_used+0x700>
1309: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
130d: e8 6e fd ff ff callq 1080 <__printf_chk@plt>
1312: 4c 8b 6d 08 mov 0x8(%rbp),%r13
1316: 48 8d 35 b5 9e 00 00 lea 0x9eb5(%rip),%rsi # b1d2 <_IO_stdin_used+0x1d2>
131d: 4c 89 ef mov %r13,%rdi
1320: e8 3b fd ff ff callq 1060
1325: 85 c0 test %eax,%eax
1327: 0f 85 d4 16 00 00 jne 2a01
132d: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1333: 4c 89 fe mov %r15,%rsi
1336: 4c 89 f7 mov %r14,%rdi
1339: e8 02 97 00 00 callq aa40
133e: bf 01 00 00 00 mov $0x1,%edi
1343: b8 01 00 00 00 mov $0x1,%eax
1348: 48 8d 35 94 a3 00 00 lea 0xa394(%rip),%rsi # b6e3 <_IO_stdin_used+0x6e3>
134f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
1353: e8 28 fd ff ff callq 1080 <__printf_chk@plt>
1358: 4c 8b 6d 08 mov 0x8(%rbp),%r13
135c: ba 06 00 00 00 mov $0x6,%edx
1361: 48 8d 35 73 9e 00 00 lea 0x9e73(%rip),%rsi # b1db <_IO_stdin_used+0x1db>
1368: 4c 89 ef mov %r13,%rdi
136b: e8 c0 fc ff ff callq 1030
1370: 85 c0 test %eax,%eax
1372: 0f 85 a5 16 00 00 jne 2a1d
1378: 48 8d 35 c8 91 00 00 lea 0x91c8(%rip),%rsi # a547
137f: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1385: 4c 89 f7 mov %r14,%rdi
1388: e8 b3 96 00 00 callq aa40
138d: bf 01 00 00 00 mov $0x1,%edi
1392: b8 01 00 00 00 mov $0x1,%eax
1397: 48 8d 35 27 a3 00 00 lea 0xa327(%rip),%rsi # b6c5 <_IO_stdin_used+0x6c5>
139e: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
13a2: e8 d9 fc ff ff callq 1080 <__printf_chk@plt>
13a7: 4c 8b 6d 08 mov 0x8(%rbp),%r13
13ab: ba 08 00 00 00 mov $0x8,%edx
13b0: 48 8d 35 22 9e 00 00 lea 0x9e22(%rip),%rsi # b1d9 <_IO_stdin_used+0x1d9>
13b7: 4c 89 ef mov %r13,%rdi
13ba: e8 71 fc ff ff callq 1030
13bf: 85 c0 test %eax,%eax
13c1: 0f 85 72 16 00 00 jne 2a39
13c7: 48 8d 35 ee 91 00 00 lea 0x91ee(%rip),%rsi # a5bc
13ce: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
13d4: 4c 89 f7 mov %r14,%rdi
13d7: e8 64 96 00 00 callq aa40
13dc: bf 01 00 00 00 mov $0x1,%edi
13e1: b8 01 00 00 00 mov $0x1,%eax
13e6: 48 8d 35 3b ac 00 00 lea 0xac3b(%rip),%rsi # c028 <_IO_stdin_used+0x1028>
13ed: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
13f1: e8 8a fc ff ff callq 1080 <__printf_chk@plt>
13f6: 4c 8b 6d 08 mov 0x8(%rbp),%r13
13fa: ba 07 00 00 00 mov $0x7,%edx
13ff: 48 8d 35 dc 9d 00 00 lea 0x9ddc(%rip),%rsi # b1e2 <_IO_stdin_used+0x1e2>
1406: 4c 89 ef mov %r13,%rdi
1409: e8 22 fc ff ff callq 1030
140e: 85 c0 test %eax,%eax
1410: 0f 85 3f 16 00 00 jne 2a55
1416: 48 8d 35 e1 92 00 00 lea 0x92e1(%rip),%rsi # a6fe
141d: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1423: 4c 89 f7 mov %r14,%rdi
1426: e8 15 96 00 00 callq aa40
142b: bf 01 00 00 00 mov $0x1,%edi
1430: b8 01 00 00 00 mov $0x1,%eax
1435: 48 8d 35 71 a2 00 00 lea 0xa271(%rip),%rsi # b6ad <_IO_stdin_used+0x6ad>
143c: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
1440: e8 3b fc ff ff callq 1080 <__printf_chk@plt>
1445: 4c 8b 6d 08 mov 0x8(%rbp),%r13
1449: ba 07 00 00 00 mov $0x7,%edx
144e: 48 8d 35 95 9d 00 00 lea 0x9d95(%rip),%rsi # b1ea <_IO_stdin_used+0x1ea>
1455: 4c 89 ef mov %r13,%rdi
1458: e8 d3 fb ff ff callq 1030
145d: 85 c0 test %eax,%eax
145f: 0f 85 0c 16 00 00 jne 2a71
1465: 48 8d 35 c7 91 00 00 lea 0x91c7(%rip),%rsi # a633
146c: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1472: 4c 89 f7 mov %r14,%rdi
1475: e8 c6 95 00 00 callq aa40
147a: bf 01 00 00 00 mov $0x1,%edi
147f: b8 01 00 00 00 mov $0x1,%eax
1484: 48 8d 35 0a a2 00 00 lea 0xa20a(%rip),%rsi # b695 <_IO_stdin_used+0x695>
148b: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
148f: e8 ec fb ff ff callq 1080 <__printf_chk@plt>
1494: 4c 8b 6d 08 mov 0x8(%rbp),%r13
1498: ba 07 00 00 00 mov $0x7,%edx
149d: 48 8d 35 4e 9d 00 00 lea 0x9d4e(%rip),%rsi # b1f2 <_IO_stdin_used+0x1f2>
14a4: 4c 89 ef mov %r13,%rdi
14a7: e8 84 fb ff ff callq 1030
14ac: 85 c0 test %eax,%eax
14ae: 0f 85 d9 15 00 00 jne 2a8d
14b4: 48 8d 35 ba 92 00 00 lea 0x92ba(%rip),%rsi # a775
14bb: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
14c1: 4c 89 f7 mov %r14,%rdi
14c4: e8 77 95 00 00 callq aa40
14c9: bf 01 00 00 00 mov $0x1,%edi
14ce: b8 01 00 00 00 mov $0x1,%eax
14d3: 48 8d 35 a3 a1 00 00 lea 0xa1a3(%rip),%rsi # b67d <_IO_stdin_used+0x67d>
14da: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
14de: e8 9d fb ff ff callq 1080 <__printf_chk@plt>
14e3: 4c 8b 6d 08 mov 0x8(%rbp),%r13
14e7: ba 06 00 00 00 mov $0x6,%edx
14ec: 48 8d 35 07 9d 00 00 lea 0x9d07(%rip),%rsi # b1fa <_IO_stdin_used+0x1fa>
14f3: 4c 89 ef mov %r13,%rdi
14f6: e8 35 fb ff ff callq 1030
14fb: 85 c0 test %eax,%eax
14fd: 0f 85 a6 15 00 00 jne 2aa9
1503: 48 8d 35 6d 93 00 00 lea 0x936d(%rip),%rsi # a877
150a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
1510: 4c 89 f7 mov %r14,%rdi
1513: e8 28 95 00 00 callq aa40
1518: bf 01 00 00 00 mov $0x1,%edi
151d: b8 01 00 00 00 mov $0x1,%eax
1522: 48 8d 35 3d a1 00 00 lea 0xa13d(%rip),%rsi # b666 <_IO_stdin_used+0x666>
1529: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
152d: e8 4e fb ff ff callq 1080 <__printf_chk@plt>
1532: 4c 8b 6d 08 mov 0x8(%rbp),%r13
1536: ba 06 00 00 00 mov $0x6,%edx
153b: 48 8d 35 bf 9c 00 00 lea 0x9cbf(%rip),%rsi # b201 <_IO_stdin_used+0x201>
1542: 4c 89 ef mov %r13,%rdi
1545: e8 e6 fa ff ff callq 1030
154a: 85 c0 test %eax,%eax
154c: 0f 85 73 15 00 00 jne 2ac5
1552: 48 8d 35 95 93 00 00 lea 0x9395(%rip),%rsi # a8ee
1559: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
155f: 4c 89 f7 mov %r14,%rdi
1562: e8 d9 94 00 00 callq aa40
1567: bf 01 00 00 00 mov $0x1,%edi
156c: b8 01 00 00 00 mov $0x1,%eax
1571: 48 8d 35 d7 a0 00 00 lea 0xa0d7(%rip),%rsi # b64f <_IO_stdin_used+0x64f>
1578: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
157c: e8 ff fa ff ff callq 1080 <__printf_chk@plt>
1581: 4c 8b 6d 08 mov 0x8(%rbp),%r13
1585: ba 06 00 00 00 mov $0x6,%edx
158a: 48 8d 35 70 9c 00 00 lea 0x9c70(%rip),%rsi # b201 <_IO_stdin_used+0x201>
1591: 4c 89 ef mov %r13,%rdi
1594: e8 97 fa ff ff callq 1030
1599: 85 c0 test %eax,%eax
159b: 75 33 jne 15d0
159d: 48 8d 35 48 92 00 00 lea 0x9248(%rip),%rsi # a7ec
15a4: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
15aa: 4c 89 f7 mov %r14,%rdi
15ad: e8 8e 94 00 00 callq aa40
15b2: bf 01 00 00 00 mov $0x1,%edi
15b7: b8 01 00 00 00 mov $0x1,%eax
15bc: 48 8d 35 3d aa 00 00 lea 0xaa3d(%rip),%rsi # c000 <_IO_stdin_used+0x1000>
15c3: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0
15c7: e8 b4 fa ff ff callq 1080 <__printf_chk@plt>
15cc: 4c 8b 6d 08 mov 0x8(%rbp),%r13
15d0: ba 06 00 00 00 mov $0x6,%edx
15d5: 48 8d 35 8b a1 00 00 lea 0xa18b(%rip),%rsi # b767 <_IO_stdin_used+0x767>
15dc: 4c 89 ef mov %r13,%rdi
15df: e8 4c fa ff ff callq 1030
15e4: 85 c0 test %eax,%eax
15e6: 0f 85 f5 14 00 00 jne 2ae1
15ec: 48 8d 35 b3 56 00 00 lea 0x56b3(%rip),%rsi # 6ca6
15f3: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0
15f9: 4c 89 f7 mov %r14,%rdi
15fc: e8 3f 94 00 00 callq aa40