Repository: clamchowder/Microbenchmarks Branch: master Commit: 13159d44086d Files: 315 Total size: 3.0 MB Directory structure: gitextract_nqotrkr3/ ├── .github/ │ └── workflows/ │ └── linux.yaml ├── .gitignore ├── AsmGen/ │ ├── AsmGen.csproj │ ├── AsmGen.sln │ ├── DataFiles/ │ │ ├── BranchhistTestBlock.c │ │ ├── CommonFunctions.c │ │ ├── GccBranchHistFunction.c │ │ ├── GccIndirectBranchFunction.c │ │ ├── IndirectBranchTestBlock.c │ │ └── clammicrobench.vcxproj_template │ ├── IUarchTest.cs │ ├── Program.cs │ ├── Properties/ │ │ └── launchSettings.json │ ├── README.md │ ├── UarchTest.cs │ ├── UarchTestHelpers.cs │ └── tests/ │ ├── A73RobTest.cs │ ├── AddLoopTest.cs │ ├── AddNsq.cs │ ├── AddSchedTest.cs │ ├── AddvNsq.cs │ ├── AddvSchedTest.cs │ ├── AeseSchedTest.cs │ ├── AesencNsq.cs │ ├── BranchBufferTest.cs │ ├── BranchHistoryTest.cs │ ├── BtbTest.cs │ ├── CvtSchedTest.cs │ ├── FAdd256RfTest.cs │ ├── Fadd128RfTest.cs │ ├── Fadd128SchedTest.cs │ ├── Fadd256SchedTest.cs │ ├── FaddNsq.cs │ ├── FaddSchedTest.cs │ ├── FcmpSchedTest.cs │ ├── FlagRfTest.cs │ ├── Fma256SchedTest.cs │ ├── FmovSched.cs │ ├── FmulSchedTest.cs │ ├── FpRfTest.cs │ ├── FpStoreDataNsq.cs │ ├── IdrfTest.cs │ ├── IndirectBranchTest.cs │ ├── IntRfDepStoreTest.cs │ ├── IntRfTest.cs │ ├── JsCvtNsq.cs │ ├── JsCvtSched.cs │ ├── JumpNsqTest.cs │ ├── JumpSchedTest.cs │ ├── LdqTest.cs │ ├── LeaSchedTest.cs │ ├── LoadNsq.cs │ ├── LoadSchedTest.cs │ ├── MaddSchedTest.cs │ ├── MaskRfTest.cs │ ├── MixAddJumpSched.cs │ ├── MixAddvJsCvtNsq.cs │ ├── MixAddvJsCvtSched.cs │ ├── MixBranchStoreTest.cs │ ├── MixFAdd256and32RfTest.cs │ ├── MixFpRfDepBranchTest.cs │ ├── MixFpVecRfTest.cs │ ├── MixIntRfDepBranchTest.cs │ ├── MixIntVec128RfTest.cs │ ├── MixIntrfFprfTest.cs │ ├── MixJumpStoreDataSched.cs │ ├── MixJumpStoreSchedTest.cs │ ├── MixJumpThenAddSched.cs │ ├── MixLdqStqTest.cs │ ├── MixLoadStoreDivSchedTest.cs │ ├── MixLoadStoreSchedTest.cs │ ├── MixStoreDivSchedTest.cs │ ├── MixVec512Vec256BlockRfTest.cs │ ├── 
MixVec512Vec256RfTest.cs │ ├── MmxRfTest.cs │ ├── MulSchedTest.cs │ ├── NopLoopTest.cs │ ├── PdepSchedTest.cs │ ├── ReturnStackTest.cs │ ├── RobTest.cs │ ├── RorSchedTest.cs │ ├── ShlSchedTest.cs │ ├── StoreDataDivNsqTest.cs │ ├── StoreDataNsqTest.cs │ ├── StoreDataSchedTest.cs │ ├── StoreDivNsqTest.cs │ ├── StoreDivSchedTest.cs │ ├── StoreNsq.cs │ ├── StoreSchedTest.cs │ ├── Stq128Test.cs │ ├── Stq512Test.cs │ ├── StqTest.cs │ ├── TakenBranchBufferTest.cs │ ├── TakenJumpSchedTest.cs │ ├── Vec512RfTest.cs │ ├── VecMulNsq.cs │ └── ZeroRobTest.cs ├── CoherencyLatency/ │ ├── CoherencyLatency.cpp │ ├── CoherencyLatency.sln │ ├── CoherencyLatency.vcxproj │ ├── Makefile │ ├── PThreadsCoherencyLatency.c │ └── c2cparse/ │ ├── Program.cs │ ├── c2cparse.csproj │ └── c2cparse.sln ├── Common/ │ ├── arch_detect.mk │ ├── ci_gpumemlatency.sh │ ├── ci_package.sh │ ├── perfmon.h │ ├── timing.c │ └── timing.h ├── CoreClockChecker/ │ ├── BoostClockChecker.c │ ├── BoostClockChecker_arm.s │ ├── BoostClockChecker_x86.s │ ├── CoreClockChecker.c │ ├── CoreClockChecker_x86.s │ ├── Makefile │ └── WinCoreClockChecker/ │ ├── CoreClockCheckFunctions.asm │ ├── WinCoreClockChecker.cpp │ ├── WinCoreClockChecker.sln │ ├── WinCoreClockChecker.vcxproj │ └── WinCoreClockChecker.vcxproj.filters ├── GpuMemLatency/ │ ├── Makefile │ ├── OpenCL/ │ │ ├── LICENSE │ │ ├── README.md │ │ ├── include/ │ │ │ └── CL/ │ │ │ ├── cl.h │ │ │ ├── cl_d3d10.h │ │ │ ├── cl_d3d11.h │ │ │ ├── cl_dx9_media_sharing.h │ │ │ ├── cl_dx9_media_sharing_intel.h │ │ │ ├── cl_egl.h │ │ │ ├── cl_ext.h │ │ │ ├── cl_ext_intel.h │ │ │ ├── cl_gl.h │ │ │ ├── cl_gl_ext.h │ │ │ ├── cl_half.h │ │ │ ├── cl_icd.h │ │ │ ├── cl_platform.h │ │ │ ├── cl_va_api_media_sharing_intel.h │ │ │ ├── cl_version.h │ │ │ └── opencl.h │ │ └── lib/ │ │ └── OpenCL.lib │ ├── atomic_test.c │ ├── bw_test.c │ ├── common.c │ ├── instruction_rate.c │ ├── instruction_rate_fp16_kernel.cl │ ├── instruction_rate_fp64_kernel.cl │ ├── instruction_rate_kernel.cl │ ├── 
kernel.cl │ ├── kernels/ │ │ ├── atomic_exec_latency_test.cl │ │ ├── buffer_bw_test.cl │ │ ├── c2c_atomic_exec_latency_test.cl │ │ ├── constant_unrolled_latency_test.cl │ │ ├── ldst_bw_test.cl │ │ ├── local_64_bw_test.cl │ │ ├── local_atomic_latency_test.cl │ │ ├── local_bw_test.cl │ │ ├── local_float4_bw_test.cl │ │ ├── local_unrolled_latency_test.cl │ │ ├── scalar_unrolled_latency_test.cl │ │ ├── sum_bw_test.cl │ │ ├── tex_bw_test.cl │ │ ├── tex_latency_test.cl │ │ └── unrolled_latency_test.cl │ ├── latency_test.c │ ├── local_mem_latency_kernel.cl │ ├── opencltest.c │ ├── opencltest.h │ ├── opencltest.sln │ ├── opencltest.vcxproj │ ├── opencltest.vcxproj.filters │ └── texturetest.c ├── InstructionRate/ │ ├── Makefile │ ├── arm_instructionrate.c │ ├── arm_instructionrate.s │ ├── riscv_instructionrate.c │ ├── riscv_instructionrate.s │ ├── test.s │ ├── x86_fusion.c │ ├── x86_fusion.s │ ├── x86_instructionrate.c │ └── x86_instructionrate.s ├── LICENSE ├── LoadedMemoryLatency/ │ ├── LoadedMemoryLatency/ │ │ ├── LoadedMemoryLatency.asm │ │ ├── LoadedMemoryLatency.cpp │ │ ├── LoadedMemoryLatency.sln │ │ ├── LoadedMemoryLatency.vcxproj │ │ └── LoadedMemoryLatency.vcxproj.filters │ ├── LoadedMemoryLatency.c │ ├── LoadedMemoryLatency_amd64.s │ ├── LoadedMemoryLatency_arm.s │ └── Makefile ├── Makefile ├── MemoryBandwidth/ │ ├── Makefile │ ├── MemoryBandwidth/ │ │ ├── MemoryBandwidth.cpp │ │ ├── MemoryBandwidth.sln │ │ ├── MemoryBandwidth.vcxproj │ │ ├── MemoryBandwidth.vcxproj.filters │ │ ├── MemoryBandwidthFunctions.asm │ │ └── MemoryBandwidthFunctions32.asm │ ├── MemoryBandwidth.c │ ├── MemoryBandwidth_arm.s │ ├── MemoryBandwidth_riscv.s │ ├── MemoryBandwidth_x86.s │ ├── MixedMemoryBandwidthTest/ │ │ ├── MemoryBandwidth.h │ │ ├── MemoryBandwidthFunctions.asm │ │ ├── MixedMemoryBandwidthTest.cpp │ │ ├── MixedMemoryBandwidthTest.vcxproj │ │ └── MixedMemoryBandwidthTest.vcxproj.filters │ └── README.md ├── MemoryLatency/ │ ├── Makefile │ ├── MemoryLatency.c │ ├── 
MemoryLatency.cpp │ ├── MemoryLatency.sln │ ├── MemoryLatency.vcxproj │ ├── MemoryLatencyFunctions.asm │ ├── MemoryLatency_arm.s │ ├── MemoryLatency_i686.s │ ├── MemoryLatency_riscv.s │ ├── MemoryLatency_x86.s │ └── README.md ├── README.md ├── mt_instructionrate/ │ ├── InstructionRateFunctions.asm │ ├── Makefile │ ├── Project1.vcxproj │ ├── Project1.vcxproj.filters │ ├── arm_mt_instructionrate.c │ ├── arm_mt_instructionrate.s │ ├── mt_instructionrate.c │ ├── mt_instructionrate.sln │ ├── ppc64_mt_instructionrate.c │ ├── ppc64_mt_instructionrate.s │ ├── x86_mt_instructionrate │ ├── x86_mt_instructionrate.c │ └── x86_mt_instructionrate.s └── svm/ ├── OpenCL/ │ ├── include/ │ │ └── CL/ │ │ ├── Utils/ │ │ │ ├── Context.h │ │ │ ├── Context.hpp │ │ │ ├── Detail.hpp │ │ │ ├── Device.hpp │ │ │ ├── Error.h │ │ │ ├── Error.hpp │ │ │ ├── ErrorCodes.h │ │ │ ├── Event.h │ │ │ ├── Event.hpp │ │ │ ├── File.h │ │ │ ├── File.hpp │ │ │ ├── InteropContext.hpp │ │ │ ├── OpenCLUtilsCpp_Export.h │ │ │ ├── OpenCLUtils_Export.h │ │ │ ├── Platform.hpp │ │ │ ├── Utils.h │ │ │ └── Utils.hpp │ │ ├── cl.h │ │ ├── cl2.hpp │ │ ├── cl_d3d10.h │ │ ├── cl_d3d11.h │ │ ├── cl_dx9_media_sharing.h │ │ ├── cl_dx9_media_sharing_intel.h │ │ ├── cl_egl.h │ │ ├── cl_ext.h │ │ ├── cl_ext_intel.h │ │ ├── cl_function_types.h │ │ ├── cl_gl.h │ │ ├── cl_gl_ext.h │ │ ├── cl_half.h │ │ ├── cl_icd.h │ │ ├── cl_layer.h │ │ ├── cl_platform.h │ │ ├── cl_va_api_media_sharing_intel.h │ │ ├── cl_version.h │ │ ├── opencl.h │ │ └── opencl.hpp │ ├── lib/ │ │ ├── OpenCL.lib │ │ ├── OpenCLExt.lib │ │ ├── OpenCLUtils.lib │ │ ├── OpenCLUtilsCpp.lib │ │ ├── OpenCLUtilsCppd.lib │ │ ├── OpenCLUtilsd.lib │ │ └── pkgconfig/ │ │ └── OpenCL.pc │ └── share/ │ ├── cmake/ │ │ ├── OpenCL/ │ │ │ ├── OpenCLConfig.cmake │ │ │ └── OpenCLConfigVersion.cmake │ │ ├── OpenCLExtensionLoader/ │ │ │ ├── OpenCLExtensionLoaderConfig.cmake │ │ │ ├── OpenCLExtensionLoaderConfigVersion.cmake │ │ │ ├── OpenCLExtensionLoaderTargets-debug.cmake │ │ │ ├── 
OpenCLExtensionLoaderTargets-release.cmake │ │ │ └── OpenCLExtensionLoaderTargets.cmake │ │ ├── OpenCLHeaders/ │ │ │ ├── OpenCLHeadersConfig.cmake │ │ │ ├── OpenCLHeadersConfigVersion.cmake │ │ │ └── OpenCLHeadersTargets.cmake │ │ ├── OpenCLHeadersCpp/ │ │ │ ├── OpenCLHeadersCppConfig.cmake │ │ │ ├── OpenCLHeadersCppConfigVersion.cmake │ │ │ └── OpenCLHeadersCppTargets.cmake │ │ ├── OpenCLICDLoader/ │ │ │ ├── OpenCLICDLoaderConfig.cmake │ │ │ ├── OpenCLICDLoaderConfigVersion.cmake │ │ │ ├── OpenCLICDLoaderTargets-debug.cmake │ │ │ ├── OpenCLICDLoaderTargets-release.cmake │ │ │ └── OpenCLICDLoaderTargets.cmake │ │ ├── OpenCLUtils/ │ │ │ ├── OpenCLUtilsConfig.cmake │ │ │ ├── OpenCLUtilsConfigVersion.cmake │ │ │ ├── OpenCLUtilsTargets-debug.cmake │ │ │ ├── OpenCLUtilsTargets-release.cmake │ │ │ └── OpenCLUtilsTargets.cmake │ │ └── OpenCLUtilsCpp/ │ │ ├── OpenCLUtilsCppConfig.cmake │ │ ├── OpenCLUtilsCppConfigVersion.cmake │ │ ├── OpenCLUtilsCppTargets-debug.cmake │ │ ├── OpenCLUtilsCppTargets-release.cmake │ │ └── OpenCLUtilsCppTargets.cmake │ └── pkgconfig/ │ ├── OpenCL-CLHPP.pc │ └── OpenCL-Headers.pc ├── atomic_latency_kernel.cl ├── svm.sln ├── svm.vcxproj ├── svm.vcxproj.filters └── svmtest.cpp ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/linux.yaml ================================================ name: Build Benchmarks on Ubuntu on: [push] jobs: BuildBenchmarks: # Only Ubuntu for now. 
runs-on: ubuntu-latest steps: - name: Install prerequisites run: sudo apt update && sudo apt -qq --assume-yes full-upgrade && sudo apt install -qq -y build-essential crossbuild-essential-arm64 gcc-riscv64-linux-gnu ocl-icd-opencl-dev opencl-headers libnuma-dev b3sum unzip - name: Wild tomfoolery attempt run: eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)" && brew install mingw-w64 - name: Check out repository code uses: actions/checkout@v3 - name: Build all benchmarks run: eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)" && make ci - name: Package benchmarks run: make package - name: b3sum run: b3sum clammarks.txz # - name: Upload package # env: # UPLOAD_KEY: ${{ secrets.UPLOAD_KEY }} # UPLOAD_URL: ${{ secrets.UPLOAD_URL }} # run: curl -X PUT -T clammarks.txz -H "$UPLOAD_KEY" "$UPLOAD_URL" ================================================ FILE: .gitignore ================================================ ## Ignore Visual Studio temporary files, build results, and ## files generated by popular Visual Studio add-ons. 
## ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore # User-specific files *.rsuser *.suo *.user *.userosscache *.sln.docstates *.swp *generatednasm* *.exe MemoryBandwidth/membw_* MemoryLatency/MemoryLatency # User-specific files (MonoDevelop/Xamarin Studio) *.userprefs # Mono auto generated files mono_crash.* # Build results [Dd]ebug/ [Dd]ebugPublic/ [Rr]elease/ [Rr]eleases/ x64/ x86/ [Ww][Ii][Nn]32/ [Aa][Rr][Mm]/ [Aa][Rr][Mm]64/ bld/ [Bb]in/ [Oo]bj/ [Ll]og/ [Ll]ogs/ clammicrobench/*.asm # Visual Studio 2015/2017 cache/options directory .vs/ # Uncomment if you have tasks that create the project's static files in wwwroot #wwwroot/ # Visual Studio 2017 auto generated files Generated\ Files/ # MSTest test Results [Tt]est[Rr]esult*/ [Bb]uild[Ll]og.* # NUnit *.VisualState.xml TestResult.xml nunit-*.xml # Build Results of an ATL Project [Dd]ebugPS/ [Rr]eleasePS/ dlldata.c # Benchmark Results BenchmarkDotNet.Artifacts/ # .NET Core project.lock.json project.fragment.lock.json artifacts/ # ASP.NET Scaffolding ScaffoldingReadMe.txt # StyleCop StyleCopReport.xml # Files built by Visual Studio *_i.c *_p.c *_h.h *.ilk *.meta *.obj *.iobj *.pch *.pdb *.ipdb *.pgc *.pgd *.rsp *.sbr *.tlb *.tli *.tlh *.tmp *.tmp_proj *_wpftmp.csproj *.log *.tlog *.vspscc *.vssscc .builds *.pidb *.svclog *.scc # Chutzpah Test files _Chutzpah* # Visual C++ cache files ipch/ *.aps *.ncb *.opendb *.opensdf *.sdf *.cachefile *.VC.db *.VC.VC.opendb # Visual Studio profiler *.psess *.vsp *.vspx *.sap # Visual Studio Trace Files *.e2e # TFS 2012 Local Workspace $tf/ # Guidance Automation Toolkit *.gpState # ReSharper is a .NET coding add-in _ReSharper*/ *.[Rr]e[Ss]harper *.DotSettings.user # TeamCity is a build add-in _TeamCity* # DotCover is a Code Coverage Tool *.dotCover # AxoCover is a Code Coverage Tool .axoCover/* !.axoCover/settings.json # Coverlet is a free, cross platform Code Coverage Tool coverage*.json coverage*.xml coverage*.info # Visual Studio code 
coverage results *.coverage *.coveragexml # NCrunch _NCrunch_* .*crunch*.local.xml nCrunchTemp_* # MightyMoose *.mm.* AutoTest.Net/ # Web workbench (sass) .sass-cache/ # Installshield output folder [Ee]xpress/ # DocProject is a documentation generator add-in DocProject/buildhelp/ DocProject/Help/*.HxT DocProject/Help/*.HxC DocProject/Help/*.hhc DocProject/Help/*.hhk DocProject/Help/*.hhp DocProject/Help/Html2 DocProject/Help/html # Click-Once directory publish/ # Publish Web Output *.[Pp]ublish.xml *.azurePubxml # Note: Comment the next line if you want to checkin your web deploy settings, # but database connection strings (with potential passwords) will be unencrypted *.pubxml *.publishproj # Microsoft Azure Web App publish settings. Comment the next line if you want to # checkin your Azure Web App publish settings, but sensitive information contained # in these scripts will be unencrypted PublishScripts/ # NuGet Packages *.nupkg # NuGet Symbol Packages *.snupkg # The packages folder can be ignored because of Package Restore **/[Pp]ackages/* # except build/, which is used as an MSBuild target. 
!**/[Pp]ackages/build/ # Uncomment if necessary however generally it will be regenerated when needed #!**/[Pp]ackages/repositories.config # NuGet v3's project.json files produces more ignorable files *.nuget.props *.nuget.targets # Nuget personal access tokens and Credentials nuget.config # Microsoft Azure Build Output csx/ *.build.csdef # Microsoft Azure Emulator ecf/ rcf/ # Windows Store app package directories and files AppPackages/ BundleArtifacts/ Package.StoreAssociation.xml _pkginfo.txt *.appx *.appxbundle *.appxupload # Visual Studio cache files # files ending in .cache can be ignored *.[Cc]ache # but keep track of directories ending in .cache !?*.[Cc]ache/ # Others ClientBin/ ~$* *~ *.dbmdl *.dbproj.schemaview *.jfm *.pfx *.publishsettings orleans.codegen.cs # Including strong name files can present a security risk # (https://github.com/github/gitignore/pull/2483#issue-259490424) #*.snk # Since there are multiple workflows, uncomment next line to ignore bower_components # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) #bower_components/ # RIA/Silverlight projects Generated_Code/ # Backup & report files from converting an old project file # to a newer Visual Studio version. Backup files are not needed, # because we have git ;-) _UpgradeReport_Files/ Backup*/ UpgradeLog*.XML UpgradeLog*.htm ServiceFabricBackup/ *.rptproj.bak # SQL Server files *.mdf *.ldf *.ndf # Business Intelligence projects *.rdl.data *.bim.layout *.bim_*.settings *.rptproj.rsuser *- [Bb]ackup.rdl *- [Bb]ackup ([0-9]).rdl *- [Bb]ackup ([0-9][0-9]).rdl # Microsoft Fakes FakesAssemblies/ # GhostDoc plugin setting file *.GhostDoc.xml # Node.js Tools for Visual Studio .ntvs_analysis.dat node_modules/ # Visual Studio 6 build log *.plg # Visual Studio 6 workspace options file *.opt # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
*.vbw # Visual Studio LightSwitch build output **/*.HTMLClient/GeneratedArtifacts **/*.DesktopClient/GeneratedArtifacts **/*.DesktopClient/ModelManifest.xml **/*.Server/GeneratedArtifacts **/*.Server/ModelManifest.xml _Pvt_Extensions # Paket dependency manager .paket/paket.exe paket-files/ # FAKE - F# Make .fake/ # CodeRush personal settings .cr/personal # Python Tools for Visual Studio (PTVS) __pycache__/ *.pyc # Cake - Uncomment if you are using it # tools/** # !tools/packages.config # Tabs Studio *.tss # Telerik's JustMock configuration file *.jmconfig # BizTalk build output *.btp.cs *.btm.cs *.odx.cs *.xsd.cs # OpenCover UI analysis results OpenCover/ # Azure Stream Analytics local run output ASALocalRun/ # MSBuild Binary and Structured Log *.binlog # NVidia Nsight GPU debugger configuration file *.nvuser # MFractors (Xamarin productivity tool) working folder .mfractor/ # Local History for Visual Studio .localhistory/ # BeatPulse healthcheck temp database healthchecksdb # Backup folder for Package Reference Convert tool in Visual Studio 2017 MigrationBackup/ # Ionide (cross platform F# VS Code tools) working folder .ionide/ # Fody - auto-generated XML schema FodyWeavers.xsd # VS Code files for those working on multiple tools .vscode/* !.vscode/settings.json !.vscode/tasks.json !.vscode/launch.json !.vscode/extensions.json *.code-workspace # Local History for Visual Studio Code .history/ # Windows Installer files from build outputs *.cab *.msi *.msix *.msm *.msp # JetBrains Rider .idea/ *.sln.iml ================================================ FILE: AsmGen/AsmGen.csproj ================================================ Exe net8.0 false x64 AnyCPU;x64 Always Always Always Always Always Always ================================================ FILE: AsmGen/AsmGen.sln ================================================  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 VisualStudioVersion = 17.2.32516.85 MinimumVisualStudioVersion = 
10.0.40219.1 Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "AsmGen", "AsmGen.csproj", "{B8930E86-946C-4831-B088-F571E73EEDC4}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU Debug|x64 = Debug|x64 Release|Any CPU = Release|Any CPU Release|x64 = Release|x64 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|Any CPU.Build.0 = Debug|Any CPU {B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|x64.ActiveCfg = Debug|x64 {B8930E86-946C-4831-B088-F571E73EEDC4}.Debug|x64.Build.0 = Debug|x64 {B8930E86-946C-4831-B088-F571E73EEDC4}.Release|Any CPU.ActiveCfg = Release|Any CPU {B8930E86-946C-4831-B088-F571E73EEDC4}.Release|Any CPU.Build.0 = Release|Any CPU {B8930E86-946C-4831-B088-F571E73EEDC4}.Release|x64.ActiveCfg = Release|x64 {B8930E86-946C-4831-B088-F571E73EEDC4}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {4433D029-CD62-44B9-862E-A8DE52DA45CE} EndGlobalSection EndGlobal
================================================ FILE: AsmGen/DataFiles/BranchhistTestBlock.c ================================================
// Statement block (no enclosing function here; presumably spliced into generated code).
// Runs the branch history test for every (branch count, history length) pair, once with
// random and once with all-zero history contents, then prints both result tables.
// Relies on names defined elsewhere: branchHistoryLengths, branchCounts, maxBranchCount,
// initializeBranchHistFuncArr, runBranchHistTest, printResultFloatArr.

// element size is taken from the array itself rather than assuming sizeof(int)
uint32_t testSizeCount = sizeof(branchHistoryLengths) / sizeof(branchHistoryLengths[0]);
initializeBranchHistFuncArr();
srand(time(NULL));
size_t resultSize = sizeof(float) * maxBranchCount * testSizeCount;
float* randomResults = (float*)malloc(resultSize);
float* predictableResults = (float*)malloc(resultSize);
for (uint32_t branchCountIdx = 0; branchCountIdx < maxBranchCount; branchCountIdx++) {
    for (uint32_t testSizeIdx = 0; testSizeIdx < testSizeCount; testSizeIdx++) {
        uint32_t testSize = branchHistoryLengths[testSizeIdx];
        uint32_t branchCount = branchCounts[branchCountIdx];
        // results are stored row-major: one row per branch count, one column per history length
        uint32_t resultIdx = branchCountIdx * testSizeCount + testSizeIdx;
        printf("Testing branch count %d history length %d\n", branchCount, testSize);
        randomResults[resultIdx] = runBranchHistTest(testSize, branchCountIdx, 1);
        predictableResults[resultIdx] = runBranchHistTest(testSize, branchCountIdx, 0);
        printf("%d, %f, %f\n", testSize, randomResults[resultIdx], predictableResults[resultIdx]);
    }
}

printf("Random:\n");
printResultFloatArr(randomResults, branchHistoryLengths, testSizeCount, branchCounts, maxBranchCount);
printf("\nPredictable:\n");
printResultFloatArr(predictableResults, branchHistoryLengths, testSizeCount, branchCounts, maxBranchCount);
free(randomResults);
free(predictableResults);
================================================ FILE: AsmGen/DataFiles/CommonFunctions.c ================================================
// this is a partial C file that's appended into generated code
// stuff here is generic enough to work for both windows/vs and gcc
#ifndef __MINGW32__
// optional affinity setting for effed up qualcomm/android bs
// NOTE(review): the five header names were missing from the text under review; the list
// below covers what this block uses (cpu_set_t/CPU_*/sched_setaffinity, printf/perror).
// Confirm against the repository before merging.
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

// Pins the calling thread to the given core and reports the outcome.
// core: zero-based logical CPU index
void setAffinity(int core) {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(core, &cpuset);
    // pid 0 targets the calling thread, so no gettid() wrapper is needed.
    // Previously the call was commented out, so the message below was printed
    // without any affinity actually being applied.
    if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) != 0) {
        perror("sched_setaffinity");
        return;
    }
    printf("Set affinity to core %d\n", core);
}
#endif

// Per-thread arguments; how they are consumed is not visible in this file.
struct ThreadData {
    int* A;
    int* B;
    float* fpArr;
    uint32_t list_size;
    uint64_t structIterations;
};

// Prints the CSV header row: "x" followed by every value in xCounts.
// xCounts: column labels
// xLen: number of entries in xCounts
void printCsvHeader(uint32_t* xCounts, uint32_t xLen) {
    printf("x");
    for (uint32_t testSizeIdx = 0; testSizeIdx < xLen; testSizeIdx++) {
        printf(", %d", xCounts[testSizeIdx]);
    }

    printf("\n");
}

// print results in format that excel can take
// arr: row-major result matrix with yLen rows and xLen columns
// xCounts/xLen: column labels and column count
// yCounts/yLen: row labels and row count
void printResultFloatArr(float* arr, uint32_t *xCounts, uint32_t xLen, uint32_t *yCounts, uint32_t yLen) {
    uint32_t testSizeCount = xLen;
    printCsvHeader(xCounts, xLen);
    for (uint32_t branchCountIdx = 0; branchCountIdx < yLen; branchCountIdx++) {
        // row header
        printf("%d", yCounts[branchCountIdx]);
        for (uint32_t testSizeIdx = 0; testSizeIdx < testSizeCount; testSizeIdx++) {
            printf(",%f", arr[branchCountIdx * testSizeCount + testSizeIdx]);
        }

        printf("\n");
    }
}

// Writes a shuffled index pattern into pattern_arr, touching one element every
// byte_increment bytes. Each used slot first holds its own index, then every slot
// (from the last down to the second) is swapped with a randomly chosen lower slot.
// Presumably the result is consumed as a pointer chasing list - confirm with callers.
// pattern_arr: array of at least list_size elements
// list_size: array length in uint32_t elements
// byte_increment: distance between used slots in bytes
void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) {
    uint32_t increment = byte_increment / sizeof(uint32_t);
    // a stride smaller than one element used to divide by zero below
    if (increment == 0) {
        return;
    }

    uint32_t element_count = list_size / increment;
    for (uint32_t i = 0; i < element_count; i++) {
        pattern_arr[i * increment] = i * increment;
    }

    uint32_t iter = element_count;
    while (iter > 1) {
        iter -= 1;
        // swap partner selection is unchanged from the original: index in [0, iter - 2],
        // or 0 when only one lower slot exists
        uint32_t j = iter - 1 == 0 ? 0 : (uint32_t)rand() % (iter - 1);
        uint32_t tmp = pattern_arr[iter * increment];
        pattern_arr[iter * increment] = pattern_arr[j * increment];
        pattern_arr[j * increment] = tmp;
    }
}
================================================ FILE: AsmGen/DataFiles/GccBranchHistFunction.c ================================================
// this is a partial C file that's appended into generated code

// Run a test, return the result in time (ns) per branch
// historyLen: length of random array that the test loops through
// branchCountIdx: index into array of branch counts, max determined by generated header/asm
// random: if 1, randomize test array contents. If 0, fill with zeroes
float runBranchHistTest(uint32_t historyLen, uint32_t branchCountIdx, int random) {
    struct timeval startTv, endTv;
    uint32_t branchCount = branchCounts[branchCountIdx];
    uint64_t iterations = 320000000 / branchCount;
    uint64_t(*branchtestFunc)(uint64_t, uint32_t**, uint32_t) __attribute((sysv_abi)) = branchtestFuncArr[branchCountIdx];
    // integer counter: a float accumulator stops growing once it reaches 2^24
    uint64_t onesCount = 0;

    // one history array per branch
    uint32_t** testArrToArr = (uint32_t**)malloc(sizeof(uint32_t*) * branchCount);
    for (uint32_t testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) {
        uint32_t* testArr = (uint32_t*)malloc(sizeof(uint32_t) * historyLen);
        for (uint32_t i = 0; i < historyLen; i++) {
            testArr[i] = random ? rand() % 2 : 0;
            if (testArr[i] > 0) {
                onesCount++;
            }
        }

        testArrToArr[testArrIdx] = testArr;
    }

    // scaled by 100 so the value matches the "percent" label (it used to print a 0..1 fraction)
    fprintf(stderr, "Starting test, should have %0.2f percent ones\n",
            100.0f * (float)onesCount / ((float)historyLen * branchCount));

    // the timezone argument is obsolete; POSIX leaves a non-NULL value unspecified
    gettimeofday(&startTv, NULL);
    uint64_t takenBranchCount = branchtestFunc(iterations, testArrToArr, historyLen);
    gettimeofday(&endTv, NULL);

    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
    // ms -> ns per loop iteration
    float latency = 1e6 * (float)time_diff_ms / (float)iterations;

    // give result in latency per branch
    latency = latency / branchCount;

    // NOTE(review): the variable is named "taken" while the message says "not-taken";
    // which one the assembly routine returns is not visible here - confirm.
    fprintf(stderr, "History length %u, branch count %u: %0.2f percent not-taken\n",
            historyLen, branchCount, 100 * (float)takenBranchCount / ((float)iterations * branchCount));

    for (uint32_t testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) {
        free(testArrToArr[testArrIdx]);
    }

    free(testArrToArr);
    return latency;
}
================================================ FILE: AsmGen/DataFiles/GccIndirectBranchFunction.c ================================================
// similar but for indirect branch test
// needs indirectBranchTestFuncArr generated
// mode:
// 0 - cycle through targets
// 1 - random target selection
// 2 - jump to middle
// any other value is reported on stderr and treated as 0
// returns time (ns) per branch
float runIndirectBranchTest(uint32_t branchCountIdx, uint32_t targetCountIdx, uint32_t mode) {
    struct timeval startTv, endTv;
    uint32_t branchCount = indirectBranchCounts[branchCountIdx];
    uint32_t targetCount = indirectBranchTargetCounts[targetCountIdx];
    uint64_t iterations = 80000000 / branchCount;
    uint64_t(*branchtestFunc)(uint64_t, uint32_t**, uint32_t, uint64_t **) __attribute((sysv_abi)) = indirectBranchTestFuncArr[branchCountIdx][targetCountIdx];

    // an unknown mode used to leave the target index arrays uninitialized
    if (mode > 2) {
        fprintf(stderr, "Unknown indirect branch test mode %u, cycling through targets instead\n", mode);
        mode = 0;
    }

    // generate an array containing jump target indexes for every branch
    uint32_t** testArrToArr = (uint32_t**)malloc(sizeof(uint32_t*) * branchCount);
    for (uint32_t testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) {
        uint32_t* testArr = (uint32_t*)malloc(sizeof(uint32_t) * targetCount);
        for (uint32_t i = 0; i < targetCount; i++) {
            switch (mode) {
            case 1:
                testArr[i] = rand() % targetCount;
                break;
            case 2:
                testArr[i] = targetCount / 2;
                break;
            default:
                testArr[i] = i;
                break;
            }
        }

        testArrToArr[testArrIdx] = testArr;
    }

    // each branch needs a jump table
    uint64_t** jumpTables = (uint64_t**)malloc(sizeof(uint64_t*) * branchCount);
    for (uint32_t jumpTableIdx = 0; jumpTableIdx < branchCount; jumpTableIdx++) {
        jumpTables[jumpTableIdx] = (uint64_t*)malloc(sizeof(uint64_t) * targetCount);
    }

    // the timezone argument is obsolete; POSIX leaves a non-NULL value unspecified
    gettimeofday(&startTv, NULL);
    // uint64_t iterations, uint32_t **arr, uint32_t arrLen, uint64_t **scratch
    branchtestFunc(iterations, testArrToArr, targetCount, jumpTables);
    gettimeofday(&endTv, NULL);

    uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);
    // ms -> ns per loop iteration
    float latency = 1e6 * (float)time_diff_ms / (float)iterations;

    // give result in latency per branch
    latency = latency / branchCount;

    for (uint32_t testArrIdx = 0; testArrIdx < branchCount; testArrIdx++) {
        free(testArrToArr[testArrIdx]);
    }

    free(testArrToArr);
    for (uint32_t jumpTableIdx = 0; jumpTableIdx < branchCount; jumpTableIdx++) {
        free(jumpTables[jumpTableIdx]);
    }

    free(jumpTables);
    return latency;
}
================================================ FILE: AsmGen/DataFiles/IndirectBranchTestBlock.c ================================================
// generated code will have:
// - indirectBranchTargetCounts = array containing # of targets per branch
// - indirectBranchCounts = array containing # of branches to test
// - maxIndirectBranchCount = length of ^^
// - initializeIndirectBranchFuncArr = populates the test function array
//   (presumably indirectBranchTestFuncArr - the end of this line was missing; confirm)

// element size is taken from the array itself rather than assuming sizeof(int)
uint32_t testSizeCount = sizeof(indirectBranchTargetCounts) / sizeof(indirectBranchTargetCounts[0]);
initializeIndirectBranchFuncArr();
srand(time(NULL));
size_t resultSize = sizeof(float) * maxIndirectBranchCount * testSizeCount;
float* results = (float*)malloc(resultSize);
float* refResults = (float*)malloc(resultSize);
for (uint32_t branchCountIdx = 0; branchCountIdx < maxIndirectBranchCount; branchCountIdx++) {
    for (uint32_t targetCountIdx = 0; targetCountIdx < testSizeCount; targetCountIdx++) {
        uint32_t testSize = indirectBranchTargetCounts[targetCountIdx];
        uint32_t branchCount = indirectBranchCounts[branchCountIdx];
        // results are stored row-major: one row per branch count, one column per target count
        uint32_t resultIdx = branchCountIdx * testSizeCount + targetCountIdx;
        printf("Testing branch count %d target count %d:", branchCount, testSize);
        // mode 0 cycles through the targets, mode 2 always jumps to the middle target (reference run)
        results[resultIdx] = runIndirectBranchTest(branchCountIdx, targetCountIdx, 0);
        refResults[resultIdx] = runIndirectBranchTest(branchCountIdx, targetCountIdx, 2);
        printf("%f ns, reference %f ns\n", results[resultIdx], refResults[resultIdx]);
    }
}

printf("Indirect branch results:\n");
printResultFloatArr(results, indirectBranchTargetCounts, testSizeCount, indirectBranchCounts, maxIndirectBranchCount);
printf("Reference indirect branch results:\n");
printResultFloatArr(refResults, indirectBranchTargetCounts, testSizeCount, indirectBranchCounts, maxIndirectBranchCount);
free(results);
free(refResults);
================================================ FILE: AsmGen/DataFiles/clammicrobench.vcxproj_template ================================================ Debug Win32 Release Win32 Debug x64 Release x64 16.0 Win32Proj {7e8cf2ba-57a7-4b42-b721-97e02bf9a8b8} clammicrobench 10.0 Application true v142 Unicode Application false v142 true Unicode Application true v142 Unicode Application false v142 true Unicode true false true false Level3 true WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true Level3 true true true WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true true true Level3 true _DEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true Level3 true true true NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true true true %REPLACEWITHCUSTOMBUILD%
================================================ FILE: AsmGen/IUarchTest.cs ================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Contract between the generator and a single microbenchmark: the generator asks each
    /// test for its extern declarations, its C test block and its assembly.
    /// </summary>
    public interface IUarchTest
    {
        /// <summary>Prefix string constant; its consumers are not visible from this file.</summary>
        public const string ThreadLaunchFunctionPrefix = "ThreadLaunch_";

        // enough to generate global lines, function calls, and let user pick from tests

        /// <summary>Short test name; printed in the generated program's list of tests.</summary>
        public string Prefix { get; }

        /// <summary>Human readable description printed next to <see cref="Prefix"/>.</summary>
        public string Description { get; }

        /// <summary>
        /// Presumably tells the generated code to report time per item rather than total
        /// time - TODO confirm against the implementations.
        /// </summary>
        public bool DivideTimeByCount { get; }

        /// <summary>Returns true when the test can be generated for <paramref name="isa"/>.</summary>
        public bool SupportsIsa(ISA isa);

        /// <summary>Appends the test's assembly for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
        public void GenerateAsm(StringBuilder sb, ISA isa);

        /// <summary>Appends the C code that runs the test to <paramref name="sb"/>.</summary>
        public void GenerateTestBlock(StringBuilder sb, ISA isa);

        /// <summary>Appends the assembly lines emitted ahead of <see cref="GenerateAsm"/>.</summary>
        public void GenerateAsmGlobalLines(StringBuilder sb);

        /// <summary>Appends the C declarations emitted ahead of the generated main().</summary>
        public void GenerateExternLines(StringBuilder sb);

        /// <summary>Instruction sets the generator can target.</summary>
        public enum ISA
        {
            amd64, // 64-bit x86
            aarch64, // 64-bit arm
            mips64, // 64-bit MIPS, for loongson
            riscv, // 64-bit risc-v
        }
    }
}
================================================ FILE: AsmGen/Program.cs ================================================
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Threading.Tasks;

namespace AsmGen
{
    class Program
    {
        public static string DataFilesDir = "DataFiles";
        static int structTestIterations = 5000000;
        static int iterations = 100 * structTestIterations;
        static int latencyListSize = 131072 * 1024 / 4; // 128 MB

        /// <summary>
        /// Builds the test list, then writes one C file and one assembly file per ISA
        /// (in parallel) followed by the Makefile.
        /// </summary>
        static void Main(string[] args)
        {
            List<IUarchTest> tests = new List<IUarchTest>
            {
                new BtbTest(4, BtbTest.BranchType.Unconditional),
                new BtbTest(8, BtbTest.BranchType.Unconditional),
                new BtbTest(16, BtbTest.BranchType.Unconditional),
                new BtbTest(32, BtbTest.BranchType.Unconditional),
                new BtbTest(64, BtbTest.BranchType.Unconditional),
                new BtbTest(4, BtbTest.BranchType.Conditional),
                new BtbTest(8, BtbTest.BranchType.Conditional),
                new BtbTest(16, BtbTest.BranchType.Conditional),
                new BtbTest(32, BtbTest.BranchType.Conditional),
                new BranchHistoryTest(),
            };

            // Enumerate the ISA enum (as GenerateMakefile does) so that a newly added ISA
            // cannot be forgotten here. Every task writes to its own file.
            List<Task> tasks = new List<Task>();
            foreach (IUarchTest.ISA isa in Enum.GetValues(typeof(IUarchTest.ISA)))
            {
                tasks.Add(Task.Run(() => GenerateCFile(tests, isa)));
                tasks.Add(Task.Run(() => GenerateAsmFile(tests, isa)));
            }

            Task.WaitAll(tasks.ToArray());
            GenerateMakefile();
        }

        /// <summary>
        /// Writes clammicrobench_{isa}.c: includes, the shared helper functions, each
        /// supported test's extern lines, the common main() prologue, each supported test's
        /// test block and the common epilogue.
        /// </summary>
        static void GenerateCFile(List<IUarchTest> tests, IUarchTest.ISA isa)
        {
            StringBuilder sb = new StringBuilder();
            sb.AppendLine("#define _GNU_SOURCE");

            // NOTE(review): the header names inside the angle brackets were missing from the
            // text under review. These six are what the generated code visibly needs (printf,
            // uint32_t, gettimeofday, malloc/rand/atoi, strncmp, time) - confirm against the
            // repository.
            sb.AppendLine("#include <stdio.h>\n#include<stdint.h>\n#include<sys/time.h>\n#include <stdlib.h>\n#include <string.h>\n#include <time.h>\n");
            sb.AppendLine("#pragma GCC diagnostic ignored \"-Wattributes\"");

            string commonFunctions = File.ReadAllText(Path.Combine(DataFilesDir, "CommonFunctions.c"));
            sb.AppendLine(commonFunctions);

            foreach (IUarchTest test in tests)
            {
                if (test.SupportsIsa(isa))
                {
                    test.GenerateExternLines(sb);
                    Console.WriteLine("Test " + test.Prefix + " supports ISA " + isa);
                }
            }

            // no indexed addressing mode on these architectures, so make sure we can do pointer
            // chasing with a single instruction
            if (isa == IUarchTest.ISA.mips64 || isa == IUarchTest.ISA.riscv)
            {
                sb.AppendLine("extern void preplatencyarr(int *arr, uint32_t list_size);");
            }

            AddCommonInitCode(sb, tests, isa);
            foreach (IUarchTest test in tests)
            {
                if (test.SupportsIsa(isa))
                {
                    test.GenerateTestBlock(sb, isa);
                }
            }

            AddCommonEndCode(sb);
            File.WriteAllText("clammicrobench_" + isa.ToString() + ".c", sb.ToString());
        }

        /// <summary>
        /// Writes clammicrobench_{isa}.s: the .text directive, the array preparation function
        /// for mips64/riscv, then the global lines and assembly of every supported test.
        /// </summary>
        static void GenerateAsmFile(List<IUarchTest> tests, IUarchTest.ISA isa)
        {
            string filename = "clammicrobench_" + isa.ToString() + ".s";
            StringBuilder sb = new StringBuilder();
            sb.AppendLine(".text");
            if (isa == IUarchTest.ISA.mips64)
            {
                UarchTest.GenerateMipsPrepArrayFunction(sb);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                UarchTest.GenerateRiscvPrepArrayFunction(sb);
            }

            // One writer for the whole file instead of reopening it once per test. Output is
            // still flushed test by test so the builder never holds more than one test's asm.
            using (StreamWriter writer = new StreamWriter(filename, false))
            {
                writer.Write(sb.ToString());
                foreach (IUarchTest test in tests)
                {
                    if (!test.SupportsIsa(isa))
                    {
                        continue;
                    }

                    sb.Clear();
                    test.GenerateAsmGlobalLines(sb);
                    test.GenerateAsm(sb, isa);
                    writer.Write(sb.ToString());
                }
            }
        }

        /// <summary>
        /// Writes the Makefile: one target per ISA, plus android, win64 and clean.
        /// </summary>
        static void GenerateMakefile()
        {
            StringBuilder sb = new StringBuilder();
            foreach (IUarchTest.ISA isa in Enum.GetValues(typeof(IUarchTest.ISA)))
            {
                sb.AppendLine(isa.ToString() + ":");
                if (isa == IUarchTest.ISA.aarch64)
                {
                    sb.AppendLine($"\tgcc -march=armv8.5-a+aes clammicrobench_{isa.ToString()}.c clammicrobench_{isa.ToString()}.s -o cb -static");

                    // hack for stupid compilers that need a ton of flags to do basic things
                    sb.AppendLine("android:");
                    sb.AppendLine("\tclang -march=armv8.3-a -mfpu=neon-fp-armv8 clammicrobench_aarch64.c clammicrobench_aarch64.s -o cb");
                }
                else
                {
                    sb.AppendLine($"\tgcc -pthread clammicrobench_{isa.ToString()}.c clammicrobench_{isa.ToString()}.s -o cb");
                }
            }

            sb.AppendLine("win64:");
            sb.AppendLine($"\tx86_64-w64-mingw32-gcc clammicrobench_{IUarchTest.ISA.amd64.ToString()}.c clammicrobench_{IUarchTest.ISA.amd64.ToString()}.s -o cb.exe");
            sb.AppendLine("clean:");

            // -f keeps "make clean" from failing when there is nothing to remove;
            // cb.exe is the output of the win64 target above
            sb.AppendLine("\trm -f clammicrobench_* cb cb.exe");
            File.WriteAllText("Makefile", sb.ToString());
        }

        // Adds largely ISA independent initialization code that gives tests a basic foundation,
        // like a pointer chasing array
        static void AddCommonInitCode(StringBuilder sb, List<IUarchTest> tests, IUarchTest.ISA isa)
        {
            sb.AppendLine("int main(int argc, char *argv[]) {");
            sb.AppendLine($" uint64_t time_diff_ms, iterations = {iterations}, structIterations = {structTestIterations}, tmp;");
            sb.AppendLine(" double latency; int *A = NULL, *B = NULL; float *fpArr = NULL; char *test_name = NULL; int core_affinity = -1; int threads = 1;");
            sb.AppendLine(" uint64_t tmpsink;");
            sb.AppendLine(" uint32_t list_size = " + latencyListSize + ";");

            // print a help message based on tests available
            sb.AppendLine($" printf(\"Usage: -test [test name] -listsize [latency list size = {latencyListSize}] -iterations [struct iterations = {structTestIterations}]\\n\");");
            sb.AppendLine(" if (argc < 2) {");
            sb.AppendLine(" printf(\"List of tests:\\n\");");
            foreach (IUarchTest test in tests)
            {
                if (test.SupportsIsa(isa))
                {
                    sb.AppendLine($" printf(\" {test.Prefix} - {test.Description}\\n\");");
                }
            }

            // args provided. parse them and run test
            sb.AppendLine(" } else {");

            // args handling
            sb.AppendLine(" for (int argIdx = 1; argIdx < argc; argIdx++) {");

            // Every recognized option consumes the following argument, so an option in the
            // last position is skipped instead of handing argv[argc] (NULL) to atoi.
            sb.AppendLine(" if (*(argv[argIdx]) == '-' && argIdx + 1 < argc) { char *arg = argv[argIdx] + 1;");
            sb.AppendLine(" if (strncmp(arg, \"test\", 4) == 0) { argIdx++; test_name = argv[argIdx]; }");

            // The usage text advertises this value as the struct iteration count, so update
            // structIterations too and derive iterations from it in 64-bit arithmetic
            // (100 * atoi() overflowed int for values above roughly 21 million).
            sb.AppendLine(" if (strncmp(arg, \"iterations\", 10) == 0) { argIdx++; structIterations = strtoull(argv[argIdx], NULL, 10); iterations = 100 * structIterations; }");
            sb.AppendLine(" if (strncmp(arg, \"listsize\", 8) == 0) { argIdx++; list_size = atoi(argv[argIdx]); }");
            sb.AppendLine(" if (strncmp(arg, \"affinity\", 8) == 0) { argIdx++; core_affinity = atoi(argv[argIdx]); }");
            sb.AppendLine(" if (strncmp(arg, \"threads\", 7) == 0) { argIdx++; threads = atoi(argv[argIdx]); }");
            sb.AppendLine(" }"); // end -arg handling if
            sb.AppendLine(" }"); // end args handling for loop

            sb.AppendLine(" if (test_name == NULL) { fprintf(stderr, \"No test specified\\n\"); return 0; }");

            // Optional affinity setting for certain troublesome platforms
            // don't need a version that uses Windows affinity APIs because Windows platforms never have this issue
            sb.AppendLine("#ifndef __MINGW32__");
            sb.AppendLine(" if (core_affinity != -1) setAffinity(core_affinity);");
            sb.AppendLine("#endif");

            // Generate array for pointer chasing unless we're doing a BTB test
            // (parentheses added around the && term; same evaluation, no -Wparentheses warning)
            sb.AppendLine(" if (argc == 1 || (argc > 1 && strncmp(test_name, \"btb\", 3) != 0)) {");
            GenerateLatencyTestArray(sb);
            sb.AppendLine(" }"); // end of ptr chasing array generation

            sb.AppendLine(" struct timeval startTv, endTv;");
            sb.AppendLine(" struct timezone startTz, endTz;");
        }

        static void AddCommonEndCode(StringBuilder sb) {
// Remainder of AddCommonEndCode: releases the benchmark arrays, then closes the "else" block and main().
            // A comes from malloc. B and fpArr come from _aligned_malloc on Windows and
            // posix_memalign elsewhere (see GenerateLatencyTestArray), so on Windows they must be
            // released with _aligned_free rather than free. All three may still be NULL when a
            // btb test skipped array generation; both free and _aligned_free accept NULL.
            sb.AppendLine(" free(A);");
            sb.AppendLine("#ifdef _WIN32");
            sb.AppendLine(" _aligned_free(B); _aligned_free(fpArr);");
            sb.AppendLine("#else");
            sb.AppendLine(" free(B); free(fpArr);");
            sb.AppendLine("#endif");
            sb.AppendLine(" }"); // end else
            sb.AppendLine(" return 0; }");
        }

        /// <summary>
        /// Emits C that allocates and fills the arrays shared by the tests: A (pointer chasing
        /// pattern from FillPatternArr), B (64-byte aligned, B[i] = i) and fpArr (64-byte aligned
        /// floats, fpArr[i] = i + .1). All are list_size elements long.
        /// </summary>
        /// <param name="sb">StringBuilder receiving the generated C</param>
        static void GenerateLatencyTestArray(StringBuilder sb)
        {
            // Fill list to create random access pattern
            sb.AppendLine(" A = (int*)malloc(sizeof(int) * list_size);");
            sb.AppendLine(" srand(time(NULL));");
            sb.AppendLine(" FillPatternArr(A, list_size, 64);\n");

            // Integer array, 64-byte aligned
            sb.AppendLine("#ifdef _WIN32");
            sb.AppendLine(" B = (int*)_aligned_malloc(sizeof(int) * list_size, 64);\n");
            sb.AppendLine("#else");
            sb.AppendLine(" posix_memalign((void **)&B, 64, sizeof(int) * list_size);\n");
            sb.AppendLine("#endif");
            sb.AppendLine(" for (int i = 0; i < list_size; i++) { B[i] = i; }\n");

            // Floating point array, 64-byte aligned
            sb.AppendLine("#ifdef _WIN32");
            sb.AppendLine(" fpArr = (float*)_aligned_malloc(sizeof(float) * list_size, 64);\n");
            sb.AppendLine("#else");
            sb.AppendLine(" posix_memalign((void **)&fpArr, 64, sizeof(float) * list_size);");
            sb.AppendLine("#endif");
            sb.AppendLine(" for (int i = 0;i < list_size; i++) { fpArr[i] = i + .1; }\n");
        }
    }
}

================================================
FILE: AsmGen/Properties/launchSettings.json
================================================
{
  "profiles": {
    "AsmGen": {
      "commandName": "Project",
      "commandLineArgs": "autocopy"
    }
  }
}

================================================
FILE: AsmGen/README.md
================================================
# Microbenchmark Generator

C# project to generate C and assembly for CPU structure size benchmarks that use different code for each data point, making them impractical to write by hand. For more details on methodology for out-of-order structure size measurement, see https://blog.stuffedcow.net/2013/05/measuring-rob-capacity/

First, go to Program.cs and set the expected sizes for the structures you want to measure. The constructor for each test generally has the same (low, high, step) format.
For example if you anticipate ROB capacity will be between 128 and 512 entries, you can do `tests.Add(new RobTest(128, 512, 1))`

# Building
Compile the project and run AsmGen.exe. That gives several output files.

Compilation for Linux:
- `gcc clammicrobench.c clammicrobench_x86.s -o clammicrobench` for x86_64
- `gcc clammicrobench.c clammicrobench_arm.s -o clammicrobench` for aarch64
- `aarch64-linux-gnu-gcc clammicrobench.c clammicrobench_arm.s -o clammicrobench` to cross compile for aarch64 (for example from a fast desktop)

For Windows, run `AsmGen.exe autocopy`. That copies generated files to the /clammicrobench directory, assuming it's run from the default VS output location. Then, open /clammicrobench/clammicrobench.sln and build. You need nasm in your path for that, as covered on README.md at repo root. The indirect branch test can take a while to build with nasm, so you might want to reduce the branch and target counts for that. Or just keep it commented out.

# Running
Generally, the syntax is `clammicrobench -test [test name] -listsize [list size for latency test] -iterations [iteration count]`. The last two parameters are optional.

# Tests
Running the program without parameters will spit out a list of tests and brief descriptions. Most are structure size tests. Instructions that consume certain core resources are placed between two pointer chasing loads. Once the two cache misses can't overlap, the structure being tested is full. Some tests, especially those measuring scheduler capacity, will hit a mix of instructions to see whether capacity is shared across different categories of instructions.

Alongside structure size tests, AsmGen is a convenient place to put other microbenchmarks that involve generating tons of code. There are several branch predictor tests:
- btb16Unconditional, etc: Creates a chain of taken branches in a loop to measure taken branch latency. Useful for showing BTB size and speed.
Different distances between branches are useful because branch predictors sometimes have trouble tracking branches that are too close together.
- btb16Conditional: Same as above but with always-taken conditional branches
- branchhist - Branch history test: Generates branches that are taken or not taken in some random pattern, then increases the length of that pattern and the number of branches. Each branch is given its own pattern. This test thus tries to see how long of a pattern the branch predictor can track before getting a lot of mispredicts.
- indirectbranch - Indirect branch prediction test: Varies the number of branch targets and branches to see how many total targets the indirect branch predictor can track
- returnstack - Tests return prediction with nested calls of varying depths. When the return stack overflows, you'll see an increase in time per call/return pair.

================================================
FILE: AsmGen/UarchTest.cs
================================================
using System.Runtime.Serialization;
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Base class for tests that emit one assembly function per entry in <see cref="Counts"/>
    /// (named Prefix + count) along with the C declarations and driver code that call them.
    /// </summary>
    public abstract class UarchTest : IUarchTest
    {
        // Function name prefix; also the name passed to -test on the generated program's command line
        public string Prefix { get; set; }

        // Human readable description printed in the test list and before results
        public string Description { get; set; }

        // Data points to generate a function for
        public int[] Counts;

        // C parameter list used in the extern declaration of each generated function
        public string FunctionDefinitionParameters { get; set; }

        // C argument list used when calling each generated function
        public string GetFunctionCallParameters { get; set; }

        // If true, run iterations / count iterations per data point and report time per iteration
        public bool DivideTimeByCount { get; set; }

        public abstract bool SupportsIsa(IUarchTest.ISA isa);

        public abstract void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa);

        /// <summary>
        /// Emits a GNU assembler .global directive for every generated function.
        /// </summary>
        public void GenerateAsmGlobalLines(StringBuilder sb)
        {
            for (int i = 0; i < Counts.Length; i++)
            {
                sb.AppendLine(".global " + Prefix + Counts[i]);
            }
        }

        /// <summary>
        /// Emits, for every count, the extern declaration of the assembly function and a
        /// pthread-compatible wrapper that unpacks a ThreadData struct and calls it.
        /// </summary>
        public void GenerateExternLines(StringBuilder sb)
        {
            for (int i = 0; i < Counts.Length; i++)
            {
                sb.AppendLine("extern uint64_t " + Prefix + Counts[i] + $"({FunctionDefinitionParameters}) __attribute((sysv_abi));");

                // Function that can be launched in a pthread. The locals are named so that
                // GetFunctionCallParameters can refer to them exactly as it does in main().
                sb.AppendLine($"void *{IUarchTest.ThreadLaunchFunctionPrefix}{Prefix}{Counts[i]}(void *pa)");
                sb.AppendLine("{");
                sb.AppendLine(" struct ThreadData *td = (struct ThreadData *)pa;");
                sb.AppendLine(" int *A = td->A;");
                sb.AppendLine(" int *B = td->B;");
                sb.AppendLine(" float *fpArr = td->fpArr;");
                sb.AppendLine(" uint32_t list_size = td->list_size;");
                sb.AppendLine(" int structIterations = td->structIterations;");
                sb.AppendLine(" " + Prefix + Counts[i] + $"({GetFunctionCallParameters});");
                sb.AppendLine(" return NULL;");
                sb.AppendLine("}");
            }
        }

        /// <summary>
        /// Emits the block of main() that runs this test when test_name matches Prefix: each
        /// data point is timed with gettimeofday and printed as "count,latency".
        /// </summary>
        /// <param name="sb">StringBuilder receiving the generated C</param>
        /// <param name="isa">Target ISA; mips64 and riscv get an extra preplatencyarr call</param>
        public void GenerateTestBlock(StringBuilder sb, IUarchTest.ISA isa)
        {
            sb.AppendLine(" if (argc > 1 && strcmp(test_name, \"" + Prefix + "\") == 0) {");
            sb.AppendLine(" printf(\"" + Description + ":\\n\");");
            if (isa == IUarchTest.ISA.mips64 || isa == IUarchTest.ISA.riscv)
            {
                sb.AppendLine(" if (argc == 1 || argc > 1 && strncmp(test_name, \"btb\", 3) != 0) {");
                sb.AppendLine("preplatencyarr(A, list_size);");
                sb.AppendLine(" }");
            }

            for (int i = 0; i < Counts.Length; i++)
            {
                // use more iterations (iterations = structIterations * 100) and divide iteration count by tested-thing count
                // for certain tests like call stack depth
                if (DivideTimeByCount)
                {
                    sb.AppendLine(" tmp = structIterations;");
                    sb.AppendLine(" structIterations = iterations / " + Counts[i] + ";");
                }

                sb.AppendLine(" gettimeofday(&startTv, &startTz);");
                sb.AppendLine("#ifndef __MINGW32__");

                // launch threads
                sb.AppendLine(" if (threads > 1) {");
                sb.AppendLine(" struct ThreadData testThreadData;");
                sb.AppendLine(" pthread_t *testThreads = (pthread_t *)malloc(threads * sizeof(pthread_t));");
                sb.AppendLine(" testThreadData.A = A;");
                sb.AppendLine(" testThreadData.B = B;");
                sb.AppendLine(" testThreadData.fpArr = fpArr;");
                sb.AppendLine(" testThreadData.list_size = list_size;");
                sb.AppendLine(" testThreadData.structIterations = structIterations;");
                sb.AppendLine(" for (int threadIdx = 0; threadIdx < threads; threadIdx++) {");
                sb.AppendLine($" pthread_create(testThreads + threadIdx, NULL, {IUarchTest.ThreadLaunchFunctionPrefix}{Prefix}{Counts[i]}, &testThreadData);");
                sb.AppendLine(" }");
                sb.AppendLine(" for (int threadIdx = 0; threadIdx < threads; threadIdx++) {");
                sb.AppendLine(" pthread_join(testThreads[threadIdx], NULL);");
                sb.AppendLine(" }");
                sb.AppendLine(" free(testThreads);");
                sb.AppendLine(" } else ");
                sb.AppendLine(" " + Prefix + Counts[i] + $"({GetFunctionCallParameters});");
                sb.AppendLine("#else");
                sb.AppendLine(" " + Prefix + Counts[i] + $"({GetFunctionCallParameters});");
                sb.AppendLine("#endif");
                sb.AppendLine(" gettimeofday(&endTv, &endTz);");
                sb.AppendLine(" time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000);");
                //sb.AppendLine(" fprintf(stderr, \"%lu ms elapsed, %lu iter\\n\", time_diff_ms, structIterations);");
                if (DivideTimeByCount)
                {
                    sb.AppendLine(" latency = 1e6 * (float)time_diff_ms / (float)(iterations);");
                }
                else
                {
                    sb.AppendLine(" latency = 1e6 * (float)time_diff_ms / (float)(structIterations);");
                }

                sb.AppendLine(" printf(\"" + Counts[i] + ",%f\\n\", latency);\n");
                if (DivideTimeByCount)
                {
                    sb.AppendLine(" structIterations = tmp;");
                }
            }

            sb.AppendLine(" }\n");
        }

        /// <summary>
        /// MIPS doesn't have an indexed load instruction which means we'd have to use an
        /// add+shift (extra two instructions), which would complicate measurements
        /// So screw around in order to use direct addressing
        /// </summary>
        /// <param name="sb">StringBuilder receiving the assembly</param>
        public static void GenerateMipsPrepArrayFunction(StringBuilder sb)
        {
            // NOTE(review): the mnemonics emitted here (alsl.d, ld.d, st.d, addi.d) look like
            // LoongArch rather than classic MIPS - confirm which toolchain the mips64 target uses.
            // NOTE(review): the loop exits on equality (bne), so an odd list size would
            // presumably never match the doubled index - verify callers pass an even size.
            // r4 = ptr to arr, r5 = arr len, in 32-bit elements
            sb.AppendLine(".global preplatencyarr");
            sb.AppendLine("preplatencyarr:");
            sb.AppendLine(" xor $r12, $r12, $r12");
            sb.AppendLine(" xor $r13, $r13, $r13");
            sb.AppendLine(" xor $r14, $r14, $r14");
            sb.AppendLine(" xor $r15, $r15, $r15"); // array index
            sb.AppendLine(" addi.d $r14, $r14, 1");
            sb.AppendLine("preplatencyarr_loop:");
            sb.AppendLine(" alsl.d $r12, $r15, $r0, 0x3"); // shift by 3 = multiply by 8 for 64-bit
            sb.AppendLine(" add.d $r12, $r4, $r12"); // add loaded value to base address
            sb.AppendLine(" ld.d $r13, $r12, 0");
            sb.AppendLine(" alsl.d $r13, $r13, $r0, 0x2"); // address calculation for loaded index. this is in 32-bit values
            sb.AppendLine(" add.d $r13, $r4, $r13");
            sb.AppendLine(" st.d $r13, $r12, 0"); // save calculated address
            sb.AppendLine(" add.d $r15, $r15, $r14");
            sb.AppendLine(" alsl.d $r16, $r15, $r0, 0x1"); // muliply 64-bit index by 2 to prevent out of bounds for 32-bit list size count
            sb.AppendLine(" bne $r16, $r5, preplatencyarr_loop"); // while idx != len
            sb.AppendLine(" jr $r1");
        }

        /// <summary>
        /// RISC-V counterpart of the array preparation routine: walks the array in 64-bit
        /// elements and replaces each stored index with base address + index * 4.
        /// </summary>
        /// <param name="sb">StringBuilder receiving the assembly</param>
        public static void GenerateRiscvPrepArrayFunction(StringBuilder sb)
        {
            sb.AppendLine(".global preplatencyarr");
            sb.AppendLine("preplatencyarr:");
            sb.AppendLine(" li x7, 0");
            sb.AppendLine(" mv x5, x10");
            sb.AppendLine("preplatencyarr_loop:");
            sb.AppendLine(" ld x28, (x5)");
            sb.AppendLine(" slli x28, x28, 2"); // index specified in 32-bit values
            sb.AppendLine(" add x28, x28, x10");
            sb.AppendLine(" sd x28, (x5)");
            sb.AppendLine(" addi x5, x5, 8"); // next element
            sb.AppendLine(" addi x7, x7, 2"); // list size is given in 32-bit elements
            sb.AppendLine(" blt x7, x11, preplatencyarr_loop");
            sb.AppendLine(" ret");
        }
    }
}

================================================
FILE: AsmGen/UarchTestHelpers.cs
================================================
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace AsmGen
{
    public static class UarchTestHelpers
    {
        /// <summary>
        /// Builds the list of data points low, low + step, ... up to and including high.
        /// Returns an empty array when low is greater than high.
        /// </summary>
        /// <param name="low">First value</param>
        /// <param name="high">Inclusive upper bound</param>
        /// <param name="step">Distance between values; must be positive</param>
        /// <exception cref="System.ArgumentOutOfRangeException">step is zero or negative</exception>
        public static int[] GenerateCountArray(int low, int high, int step)
        {
            // A non-positive step would never reach high and the loop would run until memory is exhausted
            if (step <= 0)
            {
                throw new System.ArgumentOutOfRangeException(nameof(step), step, "step must be greater than zero");
            }

            List<int> countList = new List<int>();

            // 64-bit counter so that i + step cannot wrap around when high is close to int.MaxValue
            for (long i = low; i <= high; i += step)
            {
                countList.Add((int)i);
            }

            return countList.ToArray();
        }

        /// <summary>
        /// Emits a nasm "global" directive for each of the test's generated functions.
        /// </summary>
        public static void GenerateNasmGlobalLines(StringBuilder sb, UarchTest test)
        {
            int[] counts = test.Counts;
            for (int i = 0; i < counts.Length; i++)
            {
                sb.AppendLine("global " + test.Prefix + counts[i]);
            }
        }

        /// <summary>
        /// Emits a GNU assembler ".global" directive for each of the test's generated functions.
        /// </summary>
        public static void GenerateAsmGlobalLines(StringBuilder sb, UarchTest test)
        {
            int[] counts = test.Counts;
            for (int i = 0; i < counts.Length; i++)
            {
                sb.AppendLine(".global " + test.Prefix + counts[i]);
            }
        }

        /// <summary>
        /// Emits a C extern declaration (sysv_abi) for each of the test's generated functions.
        /// </summary>
        public static void GenerateExternLines(StringBuilder sb, UarchTest test)
        {
            int[] counts = test.Counts;
            for (int i = 0; i < counts.Length; i++)
            {
                sb.AppendLine("extern uint64_t " + test.Prefix + counts[i] + $"({test.FunctionDefinitionParameters}) __attribute((sysv_abi));");
            }
        }

        /// <summary>
        /// Emits an extern "C" declaration for each of the test's generated functions.
        /// </summary>
        public static void GenerateVsExternLines(StringBuilder sb, UarchTest test)
        {
            int[] counts = test.Counts;
            for (int i = 0; i < counts.Length; i++)
            {
                sb.AppendLine("extern \"C\" uint64_t " + test.Prefix + counts[i] + $"({test.FunctionDefinitionParameters});");
            }
        }

        ///
        /// Generates test functions in assembly, with filler instructions between two divs
        /// Args are put into rcx, rdx, r8 (in that order) to match Windows calling convention
        ///
        /// StringBuilder to append to
        /// Sizes to test the structure at
        /// Function name prefix
        /// Filler instructions after first ptr chasing load
        /// Filler instructions after second ptr chasing load
        /// If true, count pointer chasing loads as consuming the tested resource
        /// (i.e.
/// ptr chasing loads consume a ROB and integer RF slot)
        /// Any extra initialization instructions
        public static void GenerateX86AsmDivStructureTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool includePtrChasingLoads = true, string initInstrs = null)
        {
            // Saved on entry in this order and restored in the opposite order before ret
            string[] savedRegs = { "rsi", "rdi", "r15", "r14", "r13", "r12", "r11", "r8", "r9", "rcx", "rdx" };

            // arguments are in RDI, RSI, RDX, RCX, R8, and R9
            // move them into familiar windows argument regs (rcx, rdx, r8), then seed r11-r15
            string[] setupInstrs =
            {
                " mov %rcx, %r9", // r9 <- rcx
                " mov %rdx, %r8", // r8 <- rdx
                " mov %rsi, %rdx", // rdx <- rsi
                " mov %rdi, %rcx", // rcx <- rdi
                " xor %r15, %r15",
                " mov $0x10, %r14",
                " mov $0x20, %r13",
                " mov $0x30, %r12",
                " mov $0x40, %r11",
            };

            foreach (int count in counts)
            {
                string label = funcNamePrefix + count;
                sb.AppendLine("\n" + label + ":");
                foreach (string reg in savedRegs)
                {
                    sb.AppendLine(" push %" + reg);
                }

                foreach (string instr in setupInstrs)
                {
                    sb.AppendLine(instr);
                }

                if (initInstrs != null)
                {
                    sb.AppendLine(initInstrs);
                }

                sb.AppendLine(" mov %rdx, %rdi");
                sb.AppendLine(" mov %rdx, %rsi");
                sb.AppendLine("\n" + label + "start:");

                // keep dividing list size by itself: first chain of six idivs, rdi by rsi
                sb.AppendLine(" xor %rdx, %rdx");
                sb.AppendLine(" mov %rdi, %rax");
                sb.AppendLine(" idiv %rsi");
                for (int rep = 0; rep < 5; rep++)
                {
                    sb.AppendLine(" xor %rdx, %rdx");
                    sb.AppendLine(" idiv %rsi");
                }

                sb.AppendLine(" sub %rax, %rsi");
                sb.AppendLine(" inc %rsi");

                // rdx is the remainder, rax is the quotient
                int fillerTotal = includePtrChasingLoads ? count - 2 : count;
                int cursor = 0;
                for (int emitted = 0; emitted < fillerTotal; emitted++)
                {
                    sb.AppendLine(fillerInstrs1[cursor]);
                    cursor = (cursor + 1) % fillerInstrs1.Length;
                }

                // second chain of six idivs, rsi by rdi
                sb.AppendLine(" xor %rdx, %rdx");
                sb.AppendLine(" mov %rsi, %rax");
                sb.AppendLine(" idiv %rdi");
                for (int rep = 0; rep < 5; rep++)
                {
                    sb.AppendLine(" xor %rdx, %rdx");
                    sb.AppendLine(" idiv %rdi");
                }

                sb.AppendLine(" sub %rax, %rdi");
                sb.AppendLine(" inc %rdi");

                cursor = 0;
                for (int emitted = 0; emitted < fillerTotal; emitted++)
                {
                    sb.AppendLine(fillerInstrs2[cursor]);
                    cursor = (cursor + 1) % fillerInstrs2.Length;
                }

                sb.AppendLine(" dec %rcx");
                sb.AppendLine(" jne " + label + "start");
                for (int regIdx = savedRegs.Length - 1; regIdx >= 0; regIdx--)
                {
                    sb.AppendLine(" pop %" + savedRegs[regIdx]);
                }

                sb.AppendLine(" ret\n\n");
            }
        }

        /// <summary>
        /// Generates x86 test functions where two chains of six idivs are each followed by
        /// maxSize instructions: dependent instructions first, then independent instructions
        /// for the remainder.
        /// </summary>
        /// <param name="sb">StringBuilder to append to</param>
        /// <param name="maxSize">Total instructions emitted after each idiv chain</param>
        /// <param name="counts">Data points; one function (funcNamePrefix + count) is generated per entry</param>
        /// <param name="funcNamePrefix">Function name prefix</param>
        /// <param name="depInstrs">Instructions cycled through for the dependent part</param>
        /// <param name="indepInstrs">Instructions cycled through for the independent part</param>
        /// <param name="divsInSq">If true, six fewer dependent instructions are emitted per data point</param>
        /// <param name="initInstrs">Any extra initialization instructions</param>
        public static void GenerateX86AsmDivNsqTestFuncs(StringBuilder sb, int maxSize, int[] counts, string funcNamePrefix, string[] depInstrs, string[] indepInstrs, bool divsInSq = false, string initInstrs = null)
        {
            // Saved on entry in this order and restored in the opposite order before ret
            string[] savedRegs = { "rsi", "rdi", "r15", "r14", "r13", "r12", "r11", "r8", "rcx", "rdx" };

            // arguments are in RDI, RSI, RDX, RCX, R8, and R9
            // move them into familiar windows argument regs (rcx, rdx, r8), then seed r11-r15
            string[] setupInstrs =
            {
                " mov %rdx, %r8", // r8 <- rdx
                " mov %rsi, %rdx", // rdx <- rsi
                " mov %rdi, %rcx", // rcx <- rdi
                " xor %r15, %r15",
                " mov $0x10, %r14",
                " mov $0x20, %r13",
                " mov $0x30, %r12",
                " mov $0x40, %r11",
            };

            foreach (int count in counts)
            {
                string label = funcNamePrefix + count;
                sb.AppendLine("\n" + label + ":");
                foreach (string reg in savedRegs)
                {
                    sb.AppendLine(" push %" + reg);
                }

                foreach (string instr in setupInstrs)
                {
                    sb.AppendLine(instr);
                }

                if (initInstrs != null)
                {
                    sb.AppendLine(initInstrs);
                }

                sb.AppendLine(" mov %rdx, %rdi");
                sb.AppendLine(" mov %rdx, %rsi");
                sb.AppendLine("\n" + label + "start:");

                // keep dividing list size by itself
                sb.AppendLine(" xor %rdx, %rdx");
                sb.AppendLine(" mov %rdi, %rax"); // divide rdi by rsi
                sb.AppendLine(" idiv %rsi");
                for (int rep = 0; rep < 5; rep++)
                {
                    sb.AppendLine(" xor %rdx, %rdx");
                    sb.AppendLine(" idiv %rsi");
                }

                sb.AppendLine(" sub %rax, %rsi");
                sb.AppendLine(" inc %rsi");

                // rdx is the remainder, rax is the quotient
                int depTotal = divsInSq ? count - 6 : count;
                for (int slot = 0, depCursor = 0, indepCursor = 0; slot < maxSize; slot++)
                {
                    if (slot >= depTotal)
                    {
                        sb.AppendLine(indepInstrs[indepCursor]);
                        indepCursor = (indepCursor + 1) % indepInstrs.Length;
                    }
                    else
                    {
                        sb.AppendLine(depInstrs[depCursor]);
                        depCursor = (depCursor + 1) % depInstrs.Length;
                    }
                }

                sb.AppendLine(" xor %rdx, %rdx");
                sb.AppendLine(" mov %rsi, %rax"); // divide rsi by rdi
                sb.AppendLine(" idiv %rdi");
                for (int rep = 0; rep < 5; rep++)
                {
                    sb.AppendLine(" xor %rdx, %rdx");
                    sb.AppendLine(" idiv %rdi");
                }

                sb.AppendLine(" sub %rax, %rdi");
                sb.AppendLine(" inc %rdi");

                for (int slot = 0, depCursor = 0, indepCursor = 0; slot < maxSize; slot++)
                {
                    if (slot >= depTotal)
                    {
                        sb.AppendLine(indepInstrs[indepCursor]);
                        indepCursor = (indepCursor + 1) % indepInstrs.Length;
                    }
                    else
                    {
                        sb.AppendLine(depInstrs[depCursor]);
                        depCursor = (depCursor + 1) % depInstrs.Length;
                    }
                }

                sb.AppendLine(" dec %rcx");
                sb.AppendLine(" jne " + label + "start");
                for (int regIdx = savedRegs.Length - 1; regIdx >= 0; regIdx--)
                {
                    sb.AppendLine(" pop %" + savedRegs[regIdx]);
                }

                sb.AppendLine(" ret\n\n");
            }
        }

        /// <summary>
        /// Generates x86 test functions with filler instructions placed after each of two
        /// pointer chasing loads. With lfence set, the second load is followed by an lfence
        /// instead of the second set of filler instructions.
        /// </summary>
        /// <param name="sb">StringBuilder to append to</param>
        /// <param name="counts">Data points; one function (funcNamePrefix + count) is generated per entry</param>
        /// <param name="funcNamePrefix">Function name prefix</param>
        /// <param name="fillerInstrs1">Filler instructions after first ptr chasing load</param>
        /// <param name="fillerInstrs2">Filler instructions after second ptr chasing load (unused when lfence is true)</param>
        /// <param name="includePtrChasingLoads">If true, two fewer filler instructions are emitted per data point</param>
        /// <param name="initInstrs">Any extra initialization instructions</param>
        /// <param name="postLoadInstrs1">Optional instructions right after the first load</param>
        /// <param name="postLoadInstrs2">Optional instructions right after the second load (unused when lfence is true)</param>
        /// <param name="lfence">If true, emit lfence after the second load instead of filler</param>
        /// <param name="cleanupInstrs">Optional instructions emitted after the loop, before registers are restored</param>
        public static void GenerateX86AsmStructureTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool includePtrChasingLoads = true, string initInstrs = null, string postLoadInstrs1 = null, string postLoadInstrs2 = null, bool lfence = true, string cleanupInstrs = null)
        {
            // Saved on entry in this order and restored in the opposite order before ret
            string[] savedRegs = { "rsi", "rdi", "r15", "r14", "r13", "r12", "r11", "r8", "rcx", "rdx" };

            // arguments are in RDI, RSI, RDX, RCX, R8, and R9
            // move them into familiar windows argument regs (rcx, rdx, r8), then seed r11-r15
            string[] setupInstrs =
            {
                " mov %rdx, %r8", // r8 <- rdx
                " mov %rsi, %rdx", // rdx <- rsi
                " mov %rdi, %rcx", // rcx <- rdi
                " xor %r15, %r15",
                " mov $0x1, %r14",
                " mov $0x2, %r13",
                " mov $0x3, %r12",
                " mov $0x4, %r11",
            };

            // Starts the two pointer chasing chains at array indices 0 and 0x40
            string[] chaseSetupInstrs =
            {
                " xor %rdi, %rdi",
                " mov $0x40, %esi",
                " mov (%rdx,%rdi,4), %edi",
                " mov (%rdx,%rsi,4), %esi",
            };

            foreach (int count in counts)
            {
                string label = funcNamePrefix + count;
                sb.AppendLine("\n" + label + ":");
                foreach (string reg in savedRegs)
                {
                    sb.AppendLine(" push %" + reg);
                }

                foreach (string instr in setupInstrs)
                {
                    sb.AppendLine(instr);
                }

                if (initInstrs != null)
                {
                    sb.AppendLine(initInstrs);
                }

                foreach (string instr in chaseSetupInstrs)
                {
                    sb.AppendLine(instr);
                }

                sb.AppendLine("\n" + label + "start:");
                sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
                if (postLoadInstrs1 != null)
                {
                    sb.AppendLine(postLoadInstrs1);
                }

                int fillerTotal = includePtrChasingLoads ? count - 2 : count;
                int cursor = 0;
                for (int emitted = 0; emitted < fillerTotal; emitted++)
                {
                    sb.AppendLine(fillerInstrs1[cursor]);
                    cursor = (cursor + 1) % fillerInstrs1.Length;
                }

                sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
                if (!lfence)
                {
                    if (postLoadInstrs2 != null)
                    {
                        sb.AppendLine(postLoadInstrs2);
                    }

                    cursor = 0;
                    for (int emitted = 0; emitted < fillerTotal; emitted++)
                    {
                        sb.AppendLine(fillerInstrs2[cursor]);
                        cursor = (cursor + 1) % fillerInstrs2.Length;
                    }
                }
                else
                {
                    sb.AppendLine("lfence");
                }

                sb.AppendLine(" dec %rcx");
                sb.AppendLine(" jne " + label + "start");
                if (cleanupInstrs != null)
                {
                    sb.AppendLine(cleanupInstrs);
                }

                for (int regIdx = savedRegs.Length - 1; regIdx >= 0; regIdx--)
                {
                    sb.AppendLine(" pop %" + savedRegs[regIdx]);
                }

                sb.AppendLine(" ret\n\n");
            }
        }

        /// <summary>
        /// Generate test functions to see how big a scheduler is, without a NSQ
        /// Dependent ops are followed by independent ops, total op count = max
        /// If number of dependent ops is greater than NSQ size, indep ops can't be executed and
        /// there will be a dispatch stall
        /// </summary>
        /// <param name="sb">Stringbuilder to append to</param>
        /// <param name="totalOps">number of ops between dependent loads. must be less than RF size but greater than SQ+NSQ size</param>
        /// <param name="counts">array of data points to test (SQ sizes in this case)</param>
        /// <param name="funcNamePrefix">function name prefix</param>
        /// <param name="dependentInstrs"></param>
        /// <param name="indepInstrs"></param>
        /// <param name="ptrChasingLoadsInSq">Do ptr chasing loads occupy entries in the SQ being measured?</param>
public static void GenerateX86AsmNsqTestFuncs(StringBuilder sb, int totalOps, int[] counts, string funcNamePrefix, string[] dependentInstrs, string[] indepInstrs, bool ptrChasingLoadsInSq = false, string initInstrs = null, string postLoadInstrs = null)
{
    foreach (int count in counts)
    {
        string funcName = funcNamePrefix + count;

        // Number of ops (out of totalOps) taken from dependentInstrs;
        // two fewer when ptrChasingLoadsInSq is set.
        int dependentOps = ptrChasingLoadsInSq ? count - 2 : count;

        EmitX86GccProlog(sb, funcName, X86GccSavedRegsWithR11);
        EmitX86IntLoopSetup(sb, funcName, initInstrs);

        sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
        if (postLoadInstrs != null) sb.AppendLine(postLoadInstrs);

        // Dependent ops first, then independent ops, each cycling through its own array.
        int depCursor = 0, indepCursor = 0;
        for (int op = 0; op < totalOps; op++)
        {
            if (op < dependentOps)
            {
                sb.AppendLine(dependentInstrs[depCursor]);
                depCursor = (depCursor + 1) % dependentInstrs.Length;
            }
            else
            {
                sb.AppendLine(indepInstrs[indepCursor]);
                indepCursor = (indepCursor + 1) % indepInstrs.Length;
            }
        }

        sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
        sb.AppendLine(" lfence");
        EmitX86GccEpilog(sb, funcName, X86GccSavedRegsWithR11);
    }
}

/// <summary>
/// Generate test functions for testing integer scheduler capacity.
/// R15's value is dependent on the pointer chasing load results.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Array of data points to test; one function is emitted per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load (cycled)</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load (cycled)</param>
/// <param name="divs">If true, two fewer filler instructions are emitted per half</param>
/// <param name="initInstrs">Any extra initialization instructions, or null</param>
public static void GenerateX86AsmIntSchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool divs = true, string initInstrs = null)
{
    foreach (int count in counts)
    {
        string funcName = funcNamePrefix + count;
        int fillerCount = divs ? count - 2 : count;

        EmitX86GccProlog(sb, funcName, X86GccSavedRegsWithR11);
        EmitX86IntLoopSetup(sb, funcName, initInstrs);

        sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
        sb.AppendLine(" mov %rdi, %r15");
        EmitX86Filler(sb, fillerInstrs1, fillerCount);

        sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
        sb.AppendLine(" mov %rsi, %r15");
        EmitX86Filler(sb, fillerInstrs2, fillerCount);

        EmitX86GccEpilog(sb, funcName, X86GccSavedRegsWithR11);
    }
}

/// <summary>
/// Generates pointer chasing test functions in assembly, where xmm0 is written by a
/// cvtsi2ss that consumes each pointer chasing load result.
/// xmm1 through xmm5 are loaded from the third argument before the loop.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Array of data points to test; one function is emitted per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load (cycled)</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load (cycled)</param>
public static void GenerateX86AsmFpSchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2)
{
    string[] scalarInit =
    {
        " movss (%r8), %xmm1",
        " movss 4(%r8), %xmm2",
        " movss 8(%r8), %xmm3",
        " movss 12(%r8), %xmm4",
        " movss 16(%r8), %xmm5",
    };

    EmitX86FpSchedFuncs(sb, counts, funcNamePrefix, fillerInstrs1, fillerInstrs2, scalarInit,
        " cvtsi2ss %rdi, %xmm0",
        " cvtsi2ss %rsi, %xmm0");
}

/// <summary>
/// 256-bit variant of the FP scheduler test: ymm1 through ymm5 are loaded from the third
/// argument, and ymm0 is written by a vbroadcastss whose address uses each pointer
/// chasing load result.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Array of data points to test; one function is emitted per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load (cycled)</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load (cycled)</param>
public static void GenerateX86AsmFp256SchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2)
{
    string[] vectorInit =
    {
        " vzeroupper",
        " vmovups (%r8), %ymm1",
        " vmovups 32(%r8), %ymm2",
        " vmovups 64(%r8), %ymm3",
        " vmovups 96(%r8), %ymm4",
        " vmovups 128(%r8), %ymm5",
    };

    EmitX86FpSchedFuncs(sb, counts, funcNamePrefix, fillerInstrs1, fillerInstrs2, vectorInit,
        " vbroadcastss (%r8,%rdi,4), %ymm0",
        " vbroadcastss (%r8,%rsi,4), %ymm0");
}

// Registers pushed by the integer test functions, in push order (popped in reverse).
private static readonly string[] X86GccSavedRegsWithR11 = { "%rsi", "%rdi", "%r15", "%r14", "%r13", "%r12", "%r11", "%r8", "%rcx", "%rdx" };

// Registers pushed by the FP test functions, in push order (popped in reverse).
private static readonly string[] X86GccSavedRegsNoR11 = { "%rsi", "%rdi", "%r15", "%r14", "%r13", "%r12", "%r8", "%rcx", "%rdx" };

// Appends each given line of assembly in order.
private static void EmitAsmLines(StringBuilder sb, params string[] lines)
{
    foreach (string line in lines) sb.AppendLine(line);
}

// Appends count instructions, cycling through instrs from its first element.
private static void EmitX86Filler(StringBuilder sb, string[] instrs, int count)
{
    int cursor = 0;
    for (int emitted = 0; emitted < count; emitted++)
    {
        sb.AppendLine(instrs[cursor]);
        cursor = (cursor + 1) % instrs.Length;
    }
}

// Emits the function label, the register pushes and the argument shuffle.
private static void EmitX86GccProlog(StringBuilder sb, string funcName, string[] savedRegs)
{
    sb.AppendLine("\n" + funcName + ":");
    foreach (string reg in savedRegs) sb.AppendLine(" push " + reg);

    // arguments are in RDI, RSI, RDX, RCX, R8, and R9
    // move them into familiar windows argument regs (rcx, rdx, r8)
    sb.AppendLine(" mov %rdx, %r8");  // r8 <- rdx
    sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi
    sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi
}

// Emits the integer constants, optional init instructions, the two priming loads and the loop label.
private static void EmitX86IntLoopSetup(StringBuilder sb, string funcName, string initInstrs)
{
    EmitAsmLines(sb,
        " xor %r15, %r15",
        " mov $0x1, %r14",
        " mov $0x2, %r13",
        " mov $0x3, %r12",
        " mov $0x4, %r11");
    if (initInstrs != null) sb.AppendLine(initInstrs);
    EmitAsmLines(sb,
        " xor %rdi, %rdi",
        " mov $0x40, %esi",
        " mov (%rdx,%rdi,4), %edi",
        " mov (%rdx,%rsi,4), %esi",
        "\n" + funcName + "start:");
}

// Emits the loop counter decrement/branch, the register pops (reverse push order) and the return.
private static void EmitX86GccEpilog(StringBuilder sb, string funcName, string[] savedRegs)
{
    sb.AppendLine(" dec %rcx");
    sb.AppendLine(" jne " + funcName + "start");
    for (int r = savedRegs.Length - 1; r >= 0; r--) sb.AppendLine(" pop " + savedRegs[r]);
    sb.AppendLine(" ret\n\n");
}

// Shared body of the scalar and 256-bit FP scheduler tests; they differ only in the FP
// register initialization and in the op that consumes each pointer chasing load result.
private static void EmitX86FpSchedFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, string[] fpInitInstrs, string firstDependentOp, string secondDependentOp)
{
    foreach (int count in counts)
    {
        string funcName = funcNamePrefix + count;

        EmitX86GccProlog(sb, funcName, X86GccSavedRegsNoR11);
        EmitAsmLines(sb,
            " xor %r15, %r15",
            " mov $0x1, %r14",
            " mov $0x1, %r13",
            " mov $0x3, %r12");

        // initialize some FP values off r8 (third argument)
        EmitAsmLines(sb, fpInitInstrs);

        // start one chain at 0, and the other at 0x40
        EmitAsmLines(sb,
            " xor %rdi, %rdi",
            " mov $0x40, %esi",
            "\n" + funcName + "start:");

        sb.AppendLine(" mov (%rdx,%rdi,4), %edi");
        sb.AppendLine(firstDependentOp);
        EmitX86Filler(sb, fillerInstrs1, count);

        sb.AppendLine(" mov (%rdx,%rsi,4), %esi");
        sb.AppendLine(secondDependentOp);
        EmitX86Filler(sb, fillerInstrs2, count);

        EmitX86GccEpilog(sb, funcName, X86GccSavedRegsNoR11);
    }
}

///
/// Generates test
/// functions in assembly, with filler instructions between two divs
/// Args are put into rcx, rdx, r8 (in that order) to match Windows calling convention
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Sizes to test the structure at</param>
/// <param name="funcNamePrefix">Function name prefix</param>
/// <param name="fillerInstrs1">Filler instructions after first ptr chasing load</param>
/// <param name="fillerInstrs2">Filler instructions after second ptr chasing load</param>
/// <param name="includePtrChasingLoads">If true, count pointer chasing loads as consuming the tested resource
/// (i.e. ptr chasing loads consume a ROB and integer RF slot)</param>
/// <param name="initInstrs">Any extra initialization instructions</param>
public static void GenerateX86NasmDivStructureTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool includePtrChasingLoads = true, string initInstrs = null)
{
    string[] savedRegs = { "rsi", "rdi", "r15", "r14", "r13", "r12", "r11" };

    foreach (int count in counts)
    {
        string funcName = funcNamePrefix + count;
        int fillerCount = includePtrChasingLoads ? count - 2 : count;

        sb.AppendLine("\n" + funcName + ":");
        foreach (string reg in savedRegs) sb.AppendLine(" push " + reg);

        sb.AppendLine(" xor r15, r15");
        sb.AppendLine(" mov r14, 0x10");
        sb.AppendLine(" mov r13, 0x20");
        sb.AppendLine(" mov r12, 0x30");
        sb.AppendLine(" mov r11, 0x40");
        if (initInstrs != null) sb.AppendLine(initInstrs);

        // Both chain registers start out as copies of the second argument.
        sb.AppendLine(" mov rdi, rdx");
        sb.AppendLine(" mov rsi, rdx");
        sb.AppendLine("\n" + funcName + "start:");

        // First div chain (rdi over rsi), followed by the first batch of filler.
        EmitNasmDivChain(sb, "rdi", "rsi");
        int cursor = 0;
        for (int emitted = 0; emitted < fillerCount; emitted++)
        {
            sb.AppendLine(fillerInstrs1[cursor]);
            cursor = (cursor + 1) % fillerInstrs1.Length;
        }

        // Second div chain (rsi over rdi), followed by the second batch of filler.
        EmitNasmDivChain(sb, "rsi", "rdi");
        cursor = 0;
        for (int emitted = 0; emitted < fillerCount; emitted++)
        {
            sb.AppendLine(fillerInstrs2[cursor]);
            cursor = (cursor + 1) % fillerInstrs2.Length;
        }

        sb.AppendLine(" dec rcx");
        sb.AppendLine(" jne " + funcName + "start");
        for (int r = savedRegs.Length - 1; r >= 0; r--) sb.AppendLine(" pop " + savedRegs[r]);
        sb.AppendLine(" ret\n\n");
    }
}

// Emits six back-to-back idivs: rax is seeded from dividendReg, each idiv divides by
// divisorReg with rdx cleared first, and the final quotient is folded back into divisorReg.
private static void EmitNasmDivChain(StringBuilder sb, string dividendReg, string divisorReg)
{
    sb.AppendLine(" xor rdx, rdx");
    sb.AppendLine(" mov rax, " + dividendReg);
    sb.AppendLine(" idiv " + divisorReg);
    for (int repeat = 0; repeat < 5; repeat++)
    {
        sb.AppendLine(" xor rdx, rdx");
        sb.AppendLine(" idiv " + divisorReg);
    }
    sb.AppendLine(" sub " + divisorReg + ", rax");
    sb.AppendLine(" inc " + divisorReg);
}

///
/// Generates test functions in ARM assembly.
/// Registers x15-x10 can be used for integer stuff
/// Args are in x0, x1, x2
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Sizes to test; one function is emitted per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load (cycled)</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load (cycled); only emitted when dsb is false</param>
/// <param name="includePtrChasingLoads">If true, two fewer filler instructions are emitted per half</param>
/// <param name="initInstrs">Any extra initialization instructions, or null</param>
/// <param name="postLoadInstrs1">Instructions placed right after the first pointer chasing load, or null</param>
/// <param name="postLoadInstrs2">Instructions placed right after the second pointer chasing load, or null; only emitted when dsb is false</param>
/// <param name="dsb">use dsb as lfence</param>
public static void GenerateArmAsmStructureTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool includePtrChasingLoads = false, string initInstrs = null, string postLoadInstrs1 = null, string postLoadInstrs2 = null, bool dsb = true)
{
    foreach (int count in counts)
    {
        string funcName = funcNamePrefix + count;
        int fillerInstrCount = includePtrChasingLoads ? count - 2 : count;

        // args in x0, x1
        EmitArmTestEntry(sb, funcName, initX10: true, initInstrs: initInstrs);

        sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
        if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);
        EmitCycledInstrs(sb, fillerInstrs1, fillerInstrCount);

        sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
        if (dsb)
        {
            // With the barrier enabled, the second half carries no filler at all.
            sb.AppendLine(" dsb sy");
            sb.AppendLine(" isb sy");
        }
        else
        {
            if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);
            EmitCycledInstrs(sb, fillerInstrs2, fillerInstrCount);
        }

        EmitArmTestExit(sb, funcName);
    }
}

/// <summary>
/// ARM counterpart of the NSQ test generator: after the first pointer chasing load,
/// emits totalOps instructions, the first ones taken from dependentInstrs and the rest
/// from indepInstrs, followed by the second load and a dsb/isb pair.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="totalOps">Total number of ops emitted between the two loads</param>
/// <param name="counts">Array of data points to test (number of dependent ops)</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="dependentInstrs">Instructions used for the dependent ops (cycled)</param>
/// <param name="indepInstrs">Instructions used for the independent ops (cycled)</param>
/// <param name="ptrChasingLoadsInSq">If true, two fewer dependent ops are emitted</param>
/// <param name="initInstrs">Any extra initialization instructions, or null</param>
/// <param name="postLoadInstrs">Instructions placed right after the first pointer chasing load, or null</param>
public static void GenerateArmAsmNsqTestFuncs(StringBuilder sb, int totalOps, int[] counts, string funcNamePrefix, string[] dependentInstrs, string[] indepInstrs, bool ptrChasingLoadsInSq = false, string initInstrs = null, string postLoadInstrs = null)
{
    foreach (int count in counts)
    {
        string funcName = funcNamePrefix + count;
        int sqInstrs = ptrChasingLoadsInSq ? count - 2 : count;

        // args in x0, x1
        EmitArmTestEntry(sb, funcName, initX10: true, initInstrs: initInstrs);

        sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]"); // current = A[current]
        if (postLoadInstrs != null) sb.AppendLine(postLoadInstrs);

        // Each array gets its own cursor, wrapped by its own length. A single shared cursor
        // wrapped by dependentInstrs.Length could index past the end of a shorter
        // indepInstrs and would start the independent sequence at an arbitrary offset.
        EmitDependentThenIndependent(sb, dependentInstrs, indepInstrs, sqInstrs, totalOps);

        sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
        sb.AppendLine(" dsb sy"); // close enough to lfence
        sb.AppendLine(" isb sy");

        EmitArmTestExit(sb, funcName);
    }
}

/// <summary>
/// Filler for todo functions: emits a label followed by a bare ret for every count.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Counts to emit stub functions for</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
public static void GenerateStub(StringBuilder sb, int[] counts, string funcNamePrefix)
{
    foreach (int count in counts)
    {
        sb.AppendLine("\n" + funcNamePrefix + count + ":");
        sb.AppendLine(" ret");
    }
}

/// <summary>
/// ARM FP scheduler test: the structure test with an s16 load placed after each pointer
/// chasing load. The barrier option is left at its default, so only the first
/// post-load instruction and first filler array are emitted.
/// </summary>
public static void GenerateArmAsmFpSchedTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2)
{
    GenerateArmAsmStructureTestFuncs(
        sb,
        counts,
        funcNamePrefix,
        fillerInstrs1,
        fillerInstrs2,
        false,
        null,
        " ldr s16, [x2, w25, uxtw #2]",
        " ldr s16, [x2, w26, uxtw #2]");
}

/// <summary>
/// ARM structure test that uses chains of udiv instructions in place of the pointer
/// chasing loads, with filler instructions after each chain.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Sizes to test; one function is emitted per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first udiv chain (cycled)</param>
/// <param name="fillerInstrs2">Filler instructions after the second udiv chain (cycled)</param>
/// <param name="includePtrChasingLoads">If true, two fewer filler instructions are emitted per half</param>
/// <param name="initInstrs">Any extra initialization instructions, or null</param>
public static void GenerateArmAsmDivStructureTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool includePtrChasingLoads = false, string initInstrs = null)
{
    foreach (int count in counts)
    {
        string funcName = funcNamePrefix + count;
        int fillerInstrCount = includePtrChasingLoads ? count - 2 : count;

        // args in x0 = iterations, x1 = list size, x2 = list (sink)
        EmitArmTestEntry(sb, funcName, initX10: false, initInstrs: initInstrs);

        EmitArmUdivChain(sb, "w25");
        EmitCycledInstrs(sb, fillerInstrs1, fillerInstrCount);

        EmitArmUdivChain(sb, "w26");
        EmitCycledInstrs(sb, fillerInstrs2, fillerInstrCount);

        EmitArmTestExit(sb, funcName);
    }
}

// Just to deal with A73
public static string GetArmDependentBranch(string prefix)
{
    return $" cmp x25, x26\n b.eq {prefix}_badthing";
}

// Label plus data word that GetArmDependentBranch jumps to.
public static string GetArmDependentBranchTarget(string prefix)
{
    return $"{prefix}_badthing:\n .word 0xf7f0a000";
}

// RISC-V flavor of the dependent branch: compares the two pointer chasing registers.
public static string GetRiscvDependentBranch(string prefix)
{
    return $" beq x5, x6, {prefix}_badthing";
}

// Label plus data word that GetRiscvDependentBranch jumps to.
public static string GetRiscvDependentBranchTarget(string prefix)
{
    return $"{prefix}_badthing:\n .word 0x00000000";
}

/// <summary>
/// ARM NSQ test built on udiv chains: after each chain, emits maxSize instructions, the
/// first ones taken from depInstrs and the rest from indepInstrs.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="maxSize">Total number of ops emitted after each udiv chain</param>
/// <param name="counts">Array of data points to test (number of dependent ops)</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="depInstrs">Instructions used for the dependent ops (cycled)</param>
/// <param name="indepInstrs">Instructions used for the independent ops (cycled)</param>
/// <param name="divsInSq">If true, six fewer dependent ops are emitted</param>
/// <param name="initInstrs">Any extra initialization instructions, or null</param>
public static void GenerateArmAsmDivNsqTestFuncs(StringBuilder sb, int maxSize, int[] counts, string funcNamePrefix, string[] depInstrs, string[] indepInstrs, bool divsInSq = false, string initInstrs = null)
{
    foreach (int count in counts)
    {
        string funcName = funcNamePrefix + count;
        int fillerInstrCount = divsInSq ? count - 6 : count;

        // args in x0 = iterations, x1 = list size, x2 = list (sink)
        EmitArmTestEntry(sb, funcName, initX10: false, initInstrs: initInstrs);

        EmitArmUdivChain(sb, "w25");
        EmitDependentThenIndependent(sb, depInstrs, indepInstrs, fillerInstrCount, maxSize);

        EmitArmUdivChain(sb, "w26");
        // Copy the second chain's result into w25 before the second batch of ops.
        sb.AppendLine(" mov w25, w26");
        EmitDependentThenIndependent(sb, depInstrs, indepInstrs, fillerInstrCount, maxSize);

        EmitArmTestExit(sb, funcName);
    }
}

/// <summary>
/// Structure test generator for the mips64 target (the emitted mnemonics are ld.d,
/// addi.d, sub.d, bnez and jr). The dsb parameter is accepted but not used.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Sizes to test; one function is emitted per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load (cycled)</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load (cycled)</param>
/// <param name="includePtrChasingLoads">If true, two fewer filler instructions are emitted per half</param>
/// <param name="initInstrs">Any extra initialization instructions, or null</param>
/// <param name="postLoadInstrs1">Instructions placed right after the first pointer chasing load, or null</param>
/// <param name="postLoadInstrs2">Instructions placed right after the second pointer chasing load, or null</param>
/// <param name="dsb">Unused</param>
public static void GenerateMipsAsmStructureTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool includePtrChasingLoads = false, string initInstrs = null, string postLoadInstrs1 = null, string postLoadInstrs2 = null, bool dsb = false)
{
    foreach (int count in counts)
    {
        string funcName = funcNamePrefix + count;
        int fillerInstrCount = includePtrChasingLoads ? count - 2 : count;

        // args in r4 = iterations, r5 = list, r6 = list (sink)
        // use r12 and r13 for ptr chasing loads, r14 as decrement for iteration count
        sb.AppendLine("\n" + funcName + ":");
        sb.AppendLine(" ld.d $r12, $r5, 0");
        sb.AppendLine(" ld.d $r13, $r5, 64");
        sb.AppendLine(" xor $r14, $r14, $r14");
        sb.AppendLine(" addi.d $r14, $r14, 1");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        sb.AppendLine("\n" + funcName + "start:");

        sb.AppendLine(" ld.d $r12, $r12, 0");
        if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);
        EmitCycledInstrs(sb, fillerInstrs1, fillerInstrCount);

        sb.AppendLine(" ld.d $r13, $r13, 0");
        if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);
        EmitCycledInstrs(sb, fillerInstrs2, fillerInstrCount);

        sb.AppendLine(" sub.d $r4, $r4, $r14");
        sb.AppendLine(" bnez $r4, " + funcName + "start");
        sb.AppendLine(" jr $r1");
    }
}

/// <summary>
/// Structure test generator for RISC-V. Saves x18-x27, chases pointers through x5 and x6,
/// and either emits a fence after the second load or a second batch of filler.
/// </summary>
/// <param name="sb">StringBuilder to append to</param>
/// <param name="counts">Sizes to test; one function is emitted per entry</param>
/// <param name="funcNamePrefix">Function name prefix; the count is appended to it</param>
/// <param name="fillerInstrs1">Filler instructions after the first pointer chasing load (cycled)</param>
/// <param name="fillerInstrs2">Filler instructions after the second pointer chasing load (cycled); only emitted when fence is false</param>
/// <param name="includePtrChasingLoads">If true, two fewer filler instructions are emitted per half</param>
/// <param name="initInstrs">Any extra initialization instructions, or null</param>
/// <param name="postLoadInstrs1">Instructions placed right after the first pointer chasing load, or null</param>
/// <param name="postLoadInstrs2">Instructions placed right after the second pointer chasing load, or null; only emitted when fence is false</param>
/// <param name="fence">If true, emit a fence after the second load instead of the second filler batch</param>
public static void GenerateRiscvAsmStructureTestFuncs(StringBuilder sb, int[] counts, string funcNamePrefix, string[] fillerInstrs1, string[] fillerInstrs2, bool includePtrChasingLoads = false, string initInstrs = null, string postLoadInstrs1 = null, string postLoadInstrs2 = null, bool fence = true)
{
    foreach (int count in counts)
    {
        string funcName = funcNamePrefix + count;
        int fillerInstrCount = includePtrChasingLoads ? count - 2 : count;

        // args in x10 = iterations, x11 = list, x12 = list (sink)
        // temporaries are x5-x7, x28-x31
        // x18-27 are to be saved
        // use x5 and x6 for ptr chasing loads
        sb.AppendLine("\n" + funcName + ":");
        // NOTE(review): the 88-byte frame is not a multiple of 16 - confirm against the RISC-V psABI.
        sb.AppendLine(" addi sp, sp, -88");
        EmitRiscvSavedRegs(sb, " sd");

        sb.AppendLine(" addi x28, x28, 1");
        sb.AppendLine(" addi x29, x29, 1");
        sb.AppendLine(" addi x30, x30, 1");
        sb.AppendLine(" addi x31, x31, 1");
        sb.AppendLine(" addi x18, x18, 2");
        sb.AppendLine(" addi x19, x19, 3");
        sb.AppendLine(" addi x20, x20, 4");
        // NOTE(review): destination is x22 while the source is x21; possibly meant x21 - confirm.
        sb.AppendLine(" addi x22, x21, 5");
        sb.AppendLine(" ld x5, (x11)");
        sb.AppendLine(" ld x6, 64(x11)");
        if (initInstrs != null) sb.AppendLine(initInstrs);
        sb.AppendLine("\n" + funcName + "start:");

        sb.AppendLine(" ld x5, (x5)");
        if (postLoadInstrs1 != null) sb.AppendLine(postLoadInstrs1);
        EmitCycledInstrs(sb, fillerInstrs1, fillerInstrCount);

        sb.AppendLine(" ld x6, (x6)");
        if (fence)
        {
            sb.AppendLine(" fence");
        }
        else
        {
            if (postLoadInstrs2 != null) sb.AppendLine(postLoadInstrs2);
            EmitCycledInstrs(sb, fillerInstrs2, fillerInstrCount);
        }

        sb.AppendLine(" addi x10, x10, -1");
        // NOTE(review): bge also takes the branch when x10 has reached zero - confirm the intended trip count.
        sb.AppendLine(" bge x10, x0, " + funcName + "start");
        EmitRiscvSavedRegs(sb, " ld");
        sb.AppendLine(" addi sp, sp, 88");
        sb.AppendLine(" ret");
    }
}

// Appends count instructions, cycling through instrs from its first element.
private static void EmitCycledInstrs(StringBuilder sb, string[] instrs, int count)
{
    int cursor = 0;
    for (int emitted = 0; emitted < count; emitted++)
    {
        sb.AppendLine(instrs[cursor]);
        cursor = (cursor + 1) % instrs.Length;
    }
}

// Appends totalCount instructions: the first dependentCount come from depInstrs, the
// remainder from indepInstrs. Each array is cycled with its own cursor.
private static void EmitDependentThenIndependent(StringBuilder sb, string[] depInstrs, string[] indepInstrs, int dependentCount, int totalCount)
{
    int depCursor = 0, indepCursor = 0;
    for (int op = 0; op < totalCount; op++)
    {
        if (op < dependentCount)
        {
            sb.AppendLine(depInstrs[depCursor]);
            depCursor = (depCursor + 1) % depInstrs.Length;
        }
        else
        {
            sb.AppendLine(indepInstrs[indepCursor]);
            indepCursor = (indepCursor + 1) % indepInstrs.Length;
        }
    }
}

// Emits the ARM function label, register spills, integer constants (x10 only when
// initX10 is set), optional init instructions, chain start values and the loop label.
private static void EmitArmTestEntry(StringBuilder sb, string funcName, bool initX10, string initInstrs)
{
    sb.AppendLine("\n" + funcName + ":");
    sb.AppendLine(" sub sp, sp, #0x50");
    sb.AppendLine(" stp x14, x15, [sp, #0x10]");
    sb.AppendLine(" stp x12, x13, [sp, #0x20]");
    sb.AppendLine(" stp x10, x11, [sp, #0x30]");
    sb.AppendLine(" stp x25, x26, [sp, #0x40]");
    sb.AppendLine(" mov x15, 1");
    sb.AppendLine(" mov x14, 2");
    sb.AppendLine(" mov x13, 3");
    sb.AppendLine(" mov x12, 4");
    sb.AppendLine(" mov x11, 5");
    if (initX10) sb.AppendLine(" mov x10, 6");
    if (initInstrs != null) sb.AppendLine(initInstrs);
    sb.AppendLine(" mov w25, 0x0");
    sb.AppendLine(" mov w26, 0x40");
    sb.AppendLine("\n" + funcName + "start:");
}

// Emits the ARM loop counter decrement/branch, register reloads and the return.
private static void EmitArmTestExit(StringBuilder sb, string funcName)
{
    sb.AppendLine(" sub x0, x0, 1");
    sb.AppendLine(" cbnz x0, " + funcName + "start");
    sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
    sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
    sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
    sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
    sb.AppendLine(" add sp, sp, #0x50");
    sb.AppendLine(" ret\n\n");
}

// Seeds reg from w1 and divides it by w13 five times in a row.
private static void EmitArmUdivChain(StringBuilder sb, string reg)
{
    sb.AppendLine(" mov " + reg + ", w1");
    for (int repeat = 0; repeat < 5; repeat++)
    {
        sb.AppendLine(" udiv " + reg + ", " + reg + ", w13");
    }
}

// Emits one store (" sd") or reload (" ld") per saved RISC-V register x18-x27,
// at consecutive 8-byte stack offsets starting at 0.
private static void EmitRiscvSavedRegs(StringBuilder sb, string mnemonic)
{
    for (int reg = 18; reg <= 27; reg++)
    {
        sb.AppendLine(mnemonic + " x" + reg + ", " + ((reg - 18) * 8) + "(sp)");
    }
}
} // closes the helper class (UarchTestHelpers, per its callers)
} // closes its namespace

================================================
FILE: AsmGen/tests/A73RobTest.cs
================================================
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Looking for reordering capacity limits on A73 by combining several different instruction types
    /// </summary>
    public class A73RobTest : UarchTest
    {
        public A73RobTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "a73rob";
            this.Description = "Mixed integer/vec128 + stores";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.aarch64;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64) return;

            string postLoadInstrs = UarchTestHelpers.GetArmDependentBranch(this.Prefix);
            string initInstrs = " ldr q0, [x1]\n" +
                " ldr q1, [x1, #0x10]\n" +
                " ldr q2, [x1, #0x20]\n" +
                " ldr q3, [x1, #0x30]\n" +
                " ldr q4, [x1, #0x40]\n";

            // One filler entry per instruction up to the largest count: the first 33 are
            // vector adds, the next 33 integer adds, and everything after that stores.
            int largestCount = this.Counts[this.Counts.Length - 1];
            List<string> fillerInstrs = new List<string>();
            for (int i = 0; i < largestCount; i++)
            {
                if (i < 33) fillerInstrs.Add(" add v1.4s, v1.4s, v0.4s");
                else if (i < 66) fillerInstrs.Add(" add x15, x15, x11");
                else fillerInstrs.Add(" str x12, [x2]");
            }

            string[] fillerInstrsArr = fillerInstrs.ToArray();
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb,
                this.Counts,
                this.Prefix,
                fillerInstrsArr,
                fillerInstrsArr,
                false,
                initInstrs,
                postLoadInstrs1: postLoadInstrs,
                postLoadInstrs2: postLoadInstrs);

            sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
    }
}

================================================
FILE: AsmGen/tests/AddLoopTest.cs
================================================
using System.Text;

namespace AsmGen
{
    public class AddLoopTest : UarchTest
    {
        /// <summary>
        /// Sets up the ADD loop throughput test for a range of loop sizes.
        /// </summary>
        /// <param name="low">must be greater than 2</param>
        /// <param name="high">Passed to GenerateCountArray</param>
        /// <param name="step">Passed to GenerateCountArray</param>
        public AddLoopTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "addloop";
            this.Description = $"ADD throughput for various loop sizes. Avoids NOP fusing";
            this.FunctionDefinitionParameters = "uint64_t iterations";
            this.GetFunctionCallParameters = "structIterations";
            this.DivideTimeByCount = true;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            // Only amd64 and aarch64 have generators below; every other ISA is unsupported.
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb);
            if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb);
        }

        // Emits one x86 loop per count: (count - 2) adds followed by dec/jnz.
        public void GenerateX86GccAsm(StringBuilder sb)
        {
            string[] unrolledAdds =
            {
                " add %r11, %r15",
                " add %r11, %r14",
                " add %r11, %r13",
                " add %r11, %r12",
            };

            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = this.Prefix + this.Counts[i];
                sb.AppendLine(funcName + ":");

                // count dec, jnz as instructions in the loop
                int addCount = this.Counts[i] - 2;
                for (int addIdx = 0; addIdx < addCount; addIdx++) sb.AppendLine(unrolledAdds[addIdx & 3]);

                sb.AppendLine(" dec %rdi");
                sb.AppendLine(" jnz " + funcName);
                sb.AppendLine(" ret");
            }
        }

        // Emits one aarch64 loop per count: (count - 2) adds followed by sub/cbnz.
        public void GenerateArmAsm(StringBuilder sb)
        {
            string[] unrolledAdds =
            {
                " add x15, x15, x11",
                " add x14, x14, x11",
                " add x13, x13, x11",
                " add x12, x12, x11",
            };

            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = this.Prefix + this.Counts[i];
                sb.AppendLine(funcName + ":");

                int addCount = this.Counts[i] - 2;
                for (int addIdx = 0; addIdx < addCount; addIdx++) sb.AppendLine(unrolledAdds[addIdx & 3]);

                sb.AppendLine(" sub x0, x0, 1");
                sb.AppendLine(" cbnz x0, " + funcName);
                sb.AppendLine(" ret");
            }
        }
    }
}

================================================
FILE: AsmGen/tests/AddNsq.cs
================================================
using System.Text;

namespace AsmGen
{
    public class AddNsq : UarchTest
    {
        // Total number of ops emitted per loop half; passed through to the NSQ generator.
        private int totalOps;

        public AddNsq(int low, int high, int step, int totalOps)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "addnsq" + totalOps;
            this.Description = "Integer adds, excluding possible NSQ";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.totalOps = totalOps;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            // if (isa == IUarchTest.ISA.aarch64) return true;
            return isa == IUarchTest.ISA.amd64;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64) return;

            // Dependent adds read %rdi, the register the first pointer chasing load writes.
            string[] depInstrs =
            {
                " add %rdi, %r15",
                " add %rdi, %r14",
            };

            // Independent adds only touch registers initialized before the loop.
            string[] indepInstrs =
            {
                " add %r13, %r11",
                " add %r12, %r11",
            };

            UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false);
        }
    }
}

================================================
FILE: AsmGen/tests/AddSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    public class AddSchedTest : UarchTest
    {
        public AddSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "addsched";
            this.Description = "Scheduler, Integer Adds";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64
                || isa == IUarchTest.ISA.aarch64
                || isa == IUarchTest.ISA.mips64
                || isa == IUarchTest.ISA.riscv;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            // In every variant the adds take a pointer chasing load result as a source operand.
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] unrolledAdds =
                {
                    " add %rdi, %r15",
                    " add %rdi, %r14",
                    " add %rdi, %r13",
                    " add %rdi, %r12",
                };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                string[] unrolledAdds =
                {
                    " add x15, x15, x25",
                    " add x14, x14, x25",
                    " add x13, x13, x25",
                    " add x12, x12, x25",
                };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false);
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                // First half depends on $r12, second half on $r13.
                string[] unrolledAdds =
                {
                    " add.d $r15, $r15, $r12",
                    " add.d $r16, $r16, $r12",
                    " add.d $r17, $r17, $r12",
                    " add.d $r18, $r18, $r12",
                };
                string[] unrolledAdds1 =
                {
                    " add.d $r15, $r15, $r13",
                    " add.d $r16, $r16, $r13",
                    " add.d $r17, $r17, $r13",
                    " add.d $r18, $r18, $r13",
                };
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, includePtrChasingLoads: false);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                // First half depends on x5, second half on x6.
                string[] unrolledAdds =
                {
                    " add x30, x30, x5",
                    " add x29, x29, x5",
                    " add x28, x28, x5",
                    " add x31, x31, x5",
                };
                string[] unrolledAdds1 =
                {
                    " add x30, x30, x6",
                    " add x31, x31, x6",
                    " add x28, x28, x6",
                    " add x29, x29, x6",
                };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, false);
            }
        }
    }
}

================================================
FILE: AsmGen/tests/AddvNsq.cs
================================================
using System.Text;

namespace AsmGen
{
/// <summary>
/// AArch64-only scheduler test built from ADDV instructions that all read v16,
/// a register refilled by the "post load" instruction handed to the ARM structure-test helper.
/// </summary>
public class AddvSched : UarchTest
{
    /// <summary>
    /// Configures the test metadata and the list of filler instruction counts to generate.
    /// </summary>
    /// <param name="low">First argument forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="high">Second argument forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="step">Third argument forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    public AddvSched(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "addvsched";
        this.Description = "ADDV Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    /// <summary>Only aarch64 is supported; ADDV is emitted for no other ISA.</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.aarch64;
    }

    /// <summary>
    /// Appends the aarch64 test functions to <paramref name="sb"/>. Does nothing for other ISAs.
    /// </summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa != IUarchTest.ISA.aarch64)
        {
            return;
        }

        // Each post-load instruction refills v16 through the index register of the load it follows.
        // The second one has to key off w26: sibling tests (AeseSchedTest, CvtSchedTest) pair the
        // first instruction sequence with w25 and the second with w26. This test previously used
        // w25 for both, which would let the second batch of ADDVs wait on the first load only.
        // NOTE(review): assumes GenerateArmAsmStructureTestFuncs chases pointers through w25/w26
        // like the hand-written ARM tests in this project - confirm against UarchTestHelpers.
        string postLoadInstrs1 = " ldr q16, [x2, w25, sxtw #0]";
        string postLoadInstrs2 = " ldr q16, [x2, w26, sxtw #0]";

        // Four ADDVs with distinct destinations and a common source (v16).
        string[] unrolledInstrs =
        {
            " addv h1, v16.4h",
            " addv h2, v16.4h",
            " addv h3, v16.4h",
            " addv h4, v16.4h",
        };

        UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
            sb,
            this.Counts,
            this.Prefix,
            unrolledInstrs,
            unrolledInstrs,
            false,
            null,
            postLoadInstrs1: postLoadInstrs1,
            postLoadInstrs2: postLoadInstrs2);
    }
}
/// <summary>
/// AESENC test described as "excluding possible NSQ": a mix of AESENC instructions that depend
/// on a freshly loaded register and AESENC instructions that depend on a register loaded once up front.
/// </summary>
public class AesencNsq : UarchTest
{
    // Passed straight through to the NSQ helper as its total-ops argument.
    private int totalOps;

    /// <summary>
    /// Configures the test. The function prefix embeds <paramref name="totalOps"/>, so instances
    /// created with different values produce differently named functions.
    /// </summary>
    public AesencNsq(int low, int high, int step, int totalOps)
    {
        this.totalOps = totalOps;
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "aesencnsq" + totalOps;
        this.Description = "AESENC, excluding possible NSQ";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    /// <summary>Reports amd64 as the only supported ISA (aarch64 is deliberately not advertised).</summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        // aarch64 support is switched off here even though GenerateAsm still carries an aarch64 path.
        return isa == IUarchTest.ISA.amd64;
    }

    /// <summary>Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.</summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // xmm1 is reloaded from (rdi + r8) after the load; xmm2 is loaded once during init.
            string afterLoad = " mov %rdi, %r15\n add %r8, %r15\n movdqu (%r15), %xmm1";
            string setup = " movdqu (%r8), %xmm2";

            string[] dependent =
            {
                " aesenc %xmm1, %xmm0",
                " aesenc %xmm1, %xmm3",
                " aesenc %xmm1, %xmm4",
                " aesenc %xmm1, %xmm5",
            };
            string[] independent =
            {
                " aesenc %xmm2, %xmm6",
                " aesenc %xmm2, %xmm7",
            };

            UarchTestHelpers.GenerateX86AsmNsqTestFuncs(
                sb, this.totalOps, this.Counts, this.Prefix, dependent, independent, false, setup, afterLoad);
            return;
        }

        if (isa == IUarchTest.ISA.aarch64)
        {
            // NOTE(review): this path emits FADD rather than an AES instruction and is not
            // advertised by SupportsIsa - looks like a leftover from a scalar FP test; confirm.
            string afterLoad = " ldr s16, [x2, w25, uxtw #2]";
            string setup = " ldr s15, [x2]";

            string[] dependent =
            {
                " fadd s0, s0, s16",
                " fadd s1, s1, s16",
                " fadd s2, s2, s16",
                " fadd s3, s3, s16",
            };
            string[] independent =
            {
                " fadd s17, s17, s15",
                " fadd s18, s18, s15",
                " fadd s19, s19, s15",
                " fadd s20, s20, s15",
            };

            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(
                sb, this.totalOps, this.Counts, this.Prefix, dependent, independent, false, setup, postLoadInstrs: afterLoad);
        }
    }
}
", preceded by dependent branch" : string.Empty); ; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; this.GetFunctionCallParameters = "structIterations, A"; this.DivideTimeByCount = false; this.mixNops = mixNops; this.initialDependentBranch = initialDependentBranch; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false; if (isa == IUarchTest.ISA.amd64) return true; if (isa == IUarchTest.ISA.aarch64) return true; if (isa == IUarchTest.ISA.mips64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { GenerateX86GccAsm(sb); } else if (isa == IUarchTest.ISA.aarch64) { GenerateArmAsm(sb); if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix)); } else if (isa == IUarchTest.ISA.mips64) { GenerateMipsAsm(sb); } } public void GenerateX86GccAsm(StringBuilder sb) { for (int i = 0; i < Counts.Length; i++) { string funcName = Prefix + Counts[i]; sb.AppendLine("\n" + funcName + ":"); sb.AppendLine(" push %rsi"); sb.AppendLine(" push %rdi"); sb.AppendLine(" push %r15"); sb.AppendLine(" push %r14"); sb.AppendLine(" push %r13"); sb.AppendLine(" push %r12"); sb.AppendLine(" push %r11"); sb.AppendLine(" push %r8"); sb.AppendLine(" push %rcx"); sb.AppendLine(" push %rdx"); // arguments are in RDI, RSI, RDX, RCX, R8, and R9 // move them into familiar windows argument regs (rcx, rdx, r8) sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi sb.AppendLine(" xor %r15, %r15"); sb.AppendLine(" mov $0x1, %r14"); sb.AppendLine(" mov $0x2, %r13"); sb.AppendLine(" mov $0x3, %r12"); sb.AppendLine(" mov $0x4, %r11"); sb.AppendLine(" xor %rdi, %rdi"); sb.AppendLine(" mov $0x40, %esi"); sb.AppendLine(" mov (%rdx,%rdi,4), %edi"); sb.AppendLine(" mov (%rdx,%rsi,4), %esi"); 
/// <summary>
/// Emits one aarch64 function per entry in <c>Counts</c>. Each loop iteration performs two
/// dependent array loads (through w25 and w26), each followed by <c>Counts[i]</c> compare-and-branch
/// pairs whose target is the very next instruction.
/// </summary>
/// <param name="sb">Receives the generated assembly text.</param>
public void GenerateArmAsm(StringBuilder sb)
{
    // Only fetched when the test was configured with an initial dependent branch.
    string dependentBranch = null;
    if (this.initialDependentBranch)
    {
        dependentBranch = UarchTestHelpers.GetArmDependentBranch(this.Prefix);
    }

    // Save the registers the test clobbers, then seed x15..x10 with 1..6 and the two load indices.
    string[] prologue =
    {
        " sub sp, sp, #0x50",
        " stp x14, x15, [sp, #0x10]",
        " stp x12, x13, [sp, #0x20]",
        " stp x10, x11, [sp, #0x30]",
        " stp x25, x26, [sp, #0x40]",
        " mov x15, 1",
        " mov x14, 2",
        " mov x13, 3",
        " mov x12, 4",
        " mov x11, 5",
        " mov x10, 6",
        " mov w25, 0x0",
        " mov w26, 0x40",
    };

    // Mirror image of the prologue's saves, ending with the return.
    string[] epilogue =
    {
        " ldp x25, x26, [sp, #0x40]",
        " ldp x10, x11, [sp, #0x30]",
        " ldp x12, x13, [sp, #0x20]",
        " ldp x14, x15, [sp, #0x10]",
        " add sp, sp, #0x50",
        " ret\n\n",
    };

    foreach (int branchCount in Counts)
    {
        string funcName = Prefix + branchCount;
        string loopTop = funcName + "start";

        sb.AppendLine("\n" + funcName + ":");
        foreach (string line in prologue)
        {
            sb.AppendLine(line);
        }

        sb.AppendLine("\n" + loopTop + ":");

        // First chain: current = A[current], then the filler branches.
        sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]");
        if (this.initialDependentBranch)
        {
            sb.AppendLine(dependentBranch);
        }

        for (int n = 0; n < branchCount; n++)
        {
            // x15 (1) never equals x10 (6) unless something else rewrites them, so b.eq falls through.
            string target = $"{funcName}_w25_target{n}";
            sb.AppendLine(" cmp x15, x10");
            sb.AppendLine(" b.eq " + target);
            sb.AppendLine(target + ":");
        }

        // Second chain, same shape, through w26.
        sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
        if (this.initialDependentBranch)
        {
            sb.AppendLine(dependentBranch);
        }

        for (int n = 0; n < branchCount; n++)
        {
            string target = $"{funcName}_w26_target{n}";
            sb.AppendLine(" cmp x15, x10");
            sb.AppendLine(" b.eq " + target);
            sb.AppendLine(target + ":");
        }

        // x0 counts the remaining iterations.
        sb.AppendLine(" sub x0, x0, 1");
        sb.AppendLine(" cbnz x0, " + loopTop);

        foreach (string line in epilogue)
        {
            sb.AppendLine(line);
        }
    }
}
/// <summary>
/// Sets the test metadata plus the two tables that drive code generation:
/// <c>branchCounts</c> (one assembly function is generated per entry) and
/// <c>historyCounts</c> (emitted into the C source as <c>branchHistoryLengths</c>).
/// </summary>
public BranchHistoryTest()
{
    Prefix = "branchhist";
    Description = "Branch predictor pattern recognition";
    FunctionDefinitionParameters = "uint64_t iterations, uint32_t **arr, uint32_t arrLen";
    GetFunctionCallParameters = "structIterations";
    DivideTimeByCount = true;

    branchCounts = new int[] { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512 };

    // History lengths step through powers of two and their 1.5x midpoints (plus 600 and a few
    // 1.25x points). The entry between 16384 and 32768 is 24576 (16384 * 1.5); it was previously
    // written as 24567, a digit transposition.
    historyCounts = new int[]
    {
        2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768,
        1024, 1536, 2048, 3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24576, 32768
    };
}
/// <summary>
/// Emits one branch-history test function per entry in <c>branchCounts</c> for the mips64 ISA tag.
/// Generated signature: branchtestFunc(iterations, testArrToArr, historyLen) with
/// r4 = iterations, r5 = array of pointers to per-branch pattern arrays, r6 = history length.
/// The count of not-taken branches is returned in r4.
/// </summary>
/// <param name="sb">Receives the generated assembly text.</param>
public void GenerateMipsAsm(StringBuilder sb)
{
    // NOTE(review): the mnemonics used here (alsl.d, maskeqz, ld.d) look like LoongArch64 rather
    // than classic MIPS - presumably that is what this project's mips64 tag targets; confirm.

    // Per-branch sequence up to (not including) the conditional branch itself.
    // Temporaries used by the generated code: r12-r20.
    string[] loadPatternElement =
    {
        " alsl.d $r14, $r12, $r0, 0x3", // byte offset into the pointer array, r12 is the branch index
        " add.d $r14, $r14, $r5",       // r14 = address of this branch's pointer
        " ld.d $r15, $r14, 0",          // r15 = base address of this branch's pattern array
        " addi.d $r12, $r12, 1",        // advance to the next branch
        " alsl.d $r16, $r13, $r0, 0x2", // byte offset into the pattern array, r13 is the pattern index
        " add.d $r16, $r16, $r15",      // r16 = address of the element to load
        " ld.w $r17, $r16, 0",
    };

    foreach (int count in branchCounts)
    {
        string functionLabel = Prefix + count;
        string loopLabel = functionLabel + "_loop";

        sb.AppendLine("\n" + functionLabel + ":");

        // r13 = pattern index, r18 = not-taken counter, r20 = constant 1 (iteration decrement).
        sb.AppendLine(" move $r13, $r0");
        sb.AppendLine(" move $r18, $r0");
        sb.AppendLine(" move $r20, $r0");
        sb.AppendLine(" addi.d $r20, $r20, 1");

        sb.AppendLine("\n" + loopLabel + ":");
        sb.AppendLine(" move $r12, $r0"); // branch index restarts at zero every iteration

        for (int branchIdx = 0; branchIdx < count; branchIdx++)
        {
            string jumpTarget = $"{functionLabel}{count}_zero{branchIdx}";

            foreach (string line in loadPatternElement)
            {
                sb.AppendLine(line);
            }

            sb.AppendLine(" bnez $r17, " + jumpTarget); // taken when the pattern element is non-zero
            sb.AppendLine(" addi.d $r18, $r18, 1");     // only reached on the not-taken path
            sb.AppendLine(jumpTarget + ":");
        }

        // Advance the pattern index and wrap it to zero without an extra branch:
        // r19 = history length - index, and maskeqz clears r13 when r19 is zero.
        sb.AppendLine(" addi.d $r13, $r13, 1");
        sb.AppendLine(" sub.d $r19, $r6, $r13");
        sb.AppendLine(" maskeqz $r13, $r13, $r19");

        sb.AppendLine(" sub.d $r4, $r4, $r20"); // decrement iteration count
        sb.AppendLine(" bnez $r4, " + loopLabel);

        sb.AppendLine(" move $r4, $r18"); // return the count of NT branches for tracking RNG quality
        sb.AppendLine(" jr $r1");
    }
}
/// <summary>
/// Appends the C snippet that runs this test when the generated program's <c>test_name</c>
/// matches <c>Prefix</c>: an if-header, a printf of the description, the shared test body, and
/// the closing brace.
/// </summary>
/// <param name="sb">Receives the generated C text.</param>
/// <param name="isa">Not consulted; the emitted block is the same for every ISA.</param>
public void GenerateTestBlock(StringBuilder sb, IUarchTest.ISA isa)
{
    // Opening guard comparing test_name against this test's prefix.
    sb.Append(" if (argc > 1 && strcmp(test_name, \"").Append(Prefix).AppendLine("\") == 0) {");

    // Heading line printed before the results.
    sb.Append(" printf(\"").Append(Description).AppendLine(":\\n\");");

    GenerateCommonTestBlock(sb);

    sb.AppendLine(" }\n");
}
/// <summary>
/// Appends the C-side tables and setup routine for the branch history test: the
/// <c>maxBranchCount</c>, <c>branchCounts</c> and <c>branchHistoryLengths</c> definitions, the
/// <c>branchtestFuncArr</c> function-pointer array, and <c>initializeBranchHistFuncArr()</c>,
/// which fills that array with the generated functions.
/// </summary>
/// <param name="sb">Receives the generated C text.</param>
/// <param name="gcc">When true the function-pointer type carries <c>__attribute((sysv_abi))</c>.</param>
public void GenerateInitializationCode(StringBuilder sb, bool gcc)
{
    int funcCount = branchCounts.Length;

    sb.AppendLine($"uint32_t maxBranchCount = {funcCount};");

    // Both tables are the non-empty arrays assigned in the constructor, rendered as C initializers.
    sb.Append($"uint32_t branchCounts[{funcCount}] = ");
    sb.AppendLine("{ " + string.Join(", ", branchCounts) + " };");

    sb.Append($"uint32_t branchHistoryLengths[{historyCounts.Length}] = ");
    sb.AppendLine("{ " + string.Join(", ", historyCounts) + " };");

    string funcArrDeclaration = gcc
        ? $"uint64_t (__attribute((sysv_abi)) *branchtestFuncArr[{funcCount}])(uint64_t iterations, uint32_t **arr, uint32_t arrLen);"
        : $"uint64_t (*branchtestFuncArr[{funcCount}])(uint64_t iterations, uint32_t **arr, uint32_t arrLen);";
    sb.AppendLine(funcArrDeclaration);

    // One assignment per generated function, in branchCounts order.
    sb.AppendLine("void initializeBranchHistFuncArr() {");
    for (int slot = 0; slot < funcCount; slot++)
    {
        sb.AppendLine($" branchtestFuncArr[{slot}] = {Prefix}{branchCounts[slot]};");
    }

    sb.AppendLine("}");
}
/// <summary>
/// Constructor for BTB test
/// </summary>
/// <param name="spacing">
/// How far apart branches should be, in bytes. The x86 generator passes it to <c>.align</c>;
/// the fixed-width NOP padding helper in this class handles 4, 8, 16, 32 and 64.
/// </param>
/// <param name="branchType">Kind of branch to generate (conditional, unconditional, or the Zen mix).</param>
/// <param name="varyspacing">
/// If true, the x86 generator inserts a rotating run of 0 to 6 extra NOPs before each branch target.
/// </param>
public BtbTest(int spacing, BranchType branchType, bool varyspacing = false)
{
    this.spacing = spacing;
    this.branchType = branchType;
    this.varyspacing = varyspacing;

    // Branch counts to generate one function for each.
    this.Counts = new int[]
    {
        1, 2, 4, 8, 16, 32, 48, 56, 64, 128, 256, 512, 768,
        1024, 1536, 2048, 3072, 4096, 4608, 5120, 6144, 7168, 8192,
        10240, 12288, 14336, 16384, 20480, 24576, 28672, 32768, 40960, 49152
    };

    string prefixTag = varyspacing ? "v" : "";
    string descriptionNote = varyspacing ? " (varied spacing)" : "";

    this.Prefix = "btb" + spacing + prefixTag + branchType;
    this.Description = "Branch Target Buffer, " + branchType + " branch every " + spacing + " bytes " + descriptionNote;
    this.FunctionDefinitionParameters = "uint64_t iterations";
    this.GetFunctionCallParameters = "structIterations";
    this.DivideTimeByCount = true;
}
/// <summary>
/// Builds the NOP padding placed after each branch on ISAs where the branch and every NOP are
/// 4 bytes wide, so that consecutive branches sit <c>spacing</c> bytes apart.
/// </summary>
/// <returns>
/// <c>spacing / 4 - 1</c> lines of " nop" joined by '\n' with no trailing newline; the empty
/// string when <c>spacing</c> is 4.
/// </returns>
/// <exception cref="NotImplementedException">
/// <c>spacing</c> is smaller than 4 or not a multiple of 4.
/// </exception>
private string Get4BNopAlign()
{
    const int instructionBytes = 4;

    // Previously only 4, 8, 16, 32 and 64 were accepted, each with a hand-counted literal.
    // Any positive multiple of the instruction width works the same way, and the computed
    // padding is byte-identical to the old literals for those five values.
    if (spacing < instructionBytes || spacing % instructionBytes != 0)
    {
        Console.WriteLine($"Unsupported padding value {spacing}");
        throw new NotImplementedException("Unsupported padding value");
    }

    // The branch itself occupies the first 4 bytes of each slot.
    int nopCount = spacing / instructionBytes - 1;

    StringBuilder padding = new StringBuilder();
    for (int i = 0; i < nopCount; i++)
    {
        if (i > 0)
        {
            padding.Append('\n');
        }

        padding.Append(" nop");
    }

    return padding.ToString();
}
/// <summary>
/// Emits one BTB test function per entry in <c>Counts</c> for the mips64 ISA tag: a chain of
/// always-taken <c>beqz</c> branches, each followed by NOP padding and its own target label,
/// wrapped in a loop that runs r4 times.
/// </summary>
/// <param name="sb">Receives the generated assembly text.</param>
public void GenerateMipsAsm(StringBuilder sb)
{
    string nopPadding = Get4BNopAlign();

    foreach (int branchCount in Counts)
    {
        string funcName = GetBranchFuncName(branchCount);
        string loopTop = funcName + "_itarget";

        sb.AppendLine(funcName + ":");

        // r12 = 1 (iteration decrement), r13 = 0 (so every beqz below is taken),
        // r14 = address of the loop top for the indirect jump used on long loops.
        sb.AppendLine(" xor $r12, $r12, $r12");
        sb.AppendLine(" addi.d $r12, $r12, 1");
        sb.AppendLine(" xor $r13, $r13, $r13");
        sb.AppendLine(" la $r14, " + loopTop);
        sb.AppendLine(loopTop + ":");

        // The loop-closing branch accounts for one of the branchCount branches, hence the start at 1.
        for (int part = 1; part < branchCount; part++)
        {
            string target = GetLabelName(funcName, part);
            sb.AppendLine(" beqz $r13, " + target);
            sb.AppendLine(nopPadding);
            sb.AppendLine(target + ":");
        }

        sb.AppendLine(" sub.d $r4, $r4, $r12"); // decrement iteration count

        bool loopIsShort = spacing * branchCount < 1024;
        if (loopIsShort)
        {
            // short branch if we're not too far away
            sb.AppendLine(" bnez $r4, " + loopTop);
        }
        else
        {
            string skipLabel = funcName + "_mips_indirect_workaround";
            sb.AppendLine(" beqz $r4, " + skipLabel); // jump over indirect branch if iteration count is reached
            sb.AppendLine(" jr $r14");                // jump back to target (start of loop)
            sb.AppendLine(skipLabel + ":");
        }

        sb.AppendLine(" jr $r1");
    }
}
/// <summary>
/// Structure-size test whose filler instructions are integer-to-float conversions
/// (cvtsi2ss on amd64, scvtf on aarch64).
/// </summary>
public class CvtSchedTest : UarchTest
{
    /// <summary>
    /// Sets up the test metadata and the list of filler counts to generate.
    /// </summary>
    /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    public CvtSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "cvtsched";
        this.Description = "F2I Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    /// <summary>
    /// Only amd64 and aarch64 have generators; mips64 and riscv are not enabled.
    /// </summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    /// <summary>
    /// Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.
    /// Nothing is emitted for ISAs other than amd64 and aarch64.
    /// </summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // First set converts from %rdi, second set from %rsi; destinations are xmm1-xmm4
            string[] rdiConversions =
            {
                " cvtsi2ss %rdi, %xmm1",
                " cvtsi2ss %rdi, %xmm2",
                " cvtsi2ss %rdi, %xmm3",
                " cvtsi2ss %rdi, %xmm4",
            };
            string[] rsiConversions =
            {
                " cvtsi2ss %rsi, %xmm1",
                " cvtsi2ss %rsi, %xmm2",
                " cvtsi2ss %rsi, %xmm3",
                " cvtsi2ss %rsi, %xmm4",
            };

            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, rdiConversions, rsiConversions);
            return;
        }

        if (isa == IUarchTest.ISA.aarch64)
        {
            // First set converts from w25, second set from w26; destinations are s0-s3
            string[] w25Conversions =
            {
                " scvtf s0, w25",
                " scvtf s1, w25",
                " scvtf s2, w25",
                " scvtf s3, w25",
            };
            string[] w26Conversions =
            {
                " scvtf s0, w26",
                " scvtf s1, w26",
                " scvtf s2, w26",
                " scvtf s3, w26",
            };

            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, w25Conversions, w26Conversions);
        }
    }
}
/// <summary>
/// Register file capacity test using vector adds as filler instructions. Can optionally
/// place a dependent branch after the load (aarch64 and riscv only).
/// </summary>
public class Fadd128RfTest : UarchTest
{
    private bool initialDependentBranch;

    /// <summary>
    /// Sets up the test metadata and the list of filler counts to generate.
    /// </summary>
    /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="initialDependentBranch">
    /// When true, "db" is appended to the prefix and a dependent branch is inserted as the
    /// post-load instruction on aarch64 and riscv.
    /// </param>
    public Fadd128RfTest(int low, int high, int step, bool initialDependentBranch)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fadd128rf" + (initialDependentBranch ? "db" : string.Empty);
        this.Description = "128-bit FP/vector RF capacity" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    /// <summary>
    /// With the dependent branch enabled only aarch64 and riscv are supported;
    /// otherwise amd64 and aarch64 are.
    /// </summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (this.initialDependentBranch)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return true;
        if (isa == IUarchTest.ISA.mips64) return false;
        return false;
    }

    /// <summary>
    /// Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.
    /// Nothing is emitted for ISAs without a branch below (e.g. mips64).
    /// </summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // NOTE(review): these are ymm (256-bit) registers although the test is described
            // as 128-bit - confirm whether xmm was intended before changing.
            string initInstrs = " vmovups (%r8), %ymm0\n";
            for (int i = 1; i < 16; i++) initInstrs += $" vmovups %ymm0, %ymm{i}\n";

            // One add per destination register ymm1..ymm15, all reading ymm0
            List<string> unrolledAddsList = new List<string>();
            for (int i = 1; i < 16; i++) unrolledAddsList.Add($" vaddps %ymm0, %ymm{i}, %ymm{i}");

            string[] unrolledAdds = unrolledAddsList.ToArray();
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;

            // Preload q0-q4 from consecutive 16-byte slots at x1
            string initInstrs = " ldr q0, [x1]\n" +
                " ldr q1, [x1, #0x10]\n" +
                " ldr q2, [x1, #0x20]\n" +
                " ldr q3, [x1, #0x30]\n" +
                " ldr q4, [x1, #0x40]\n";

            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " add v1.4s, v1.4s, v0.4s";
            unrolledAdds[1] = " add v2.4s, v2.4s, v0.4s";
            unrolledAdds[2] = " add v3.4s, v3.4s, v0.4s";
            unrolledAdds[3] = " add v4.4s, v4.4s, v0.4s";

            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs,
                postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);

            // The dependent branch needs its target label emitted alongside the functions
            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            string initInstrs = " vsetvli t5, t6, e32\n vlw.v v0, (a1)\n vlw.v v1, (a1)\n vlw.v v2, (a1)\n vlw.v v3, (a1)";

            string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;
            postLoadInstrs += "\n mv t6, a2";

            string[] unrolledInstrs = new string[1];
            unrolledInstrs[0] = " vfadd.vv v0, v0, v0";

            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false,
                initInstrs: initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);

            if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
        }
    }
}
/// <summary>
/// Scheduler capacity test using 256-bit FP adds as filler (vaddps on amd64,
/// xvfadd.s on mips64). aarch64 is not supported.
/// </summary>
public class Fadd256SchedTest : UarchTest
{
    /// <summary>
    /// Sets up the test metadata and the list of filler counts to generate.
    /// </summary>
    /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    public Fadd256SchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fadd256sched";
        this.Description = "256-bit FP add scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    /// <summary>
    /// Supported on amd64 and mips64 only.
    /// </summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64) return true;
        if (isa == IUarchTest.ISA.aarch64) return false;
        if (isa == IUarchTest.ISA.mips64) return true;
        return false;
    }

    /// <summary>
    /// Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.
    /// Nothing is emitted for aarch64 or any other ISA without a branch below.
    /// </summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            // ymm0 is dependent on ptr chasing load
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " vaddps %ymm0, %ymm1, %ymm1";
            unrolledAdds[1] = " vaddps %ymm0, %ymm2, %ymm2";
            unrolledAdds[2] = " vaddps %ymm0, %ymm3, %ymm3";
            // Accumulate into ymm4 like the other three slots (was writing ymm3 a second time)
            unrolledAdds[3] = " vaddps %ymm0, %ymm4, %ymm4";
            UarchTestHelpers.GenerateX86AsmFp256SchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            // Preload xr0-xr31 from consecutive 32-byte slots at r6
            string initInstrs = "";
            for (int regIdx = 0; regIdx < 32; regIdx++)
            {
                initInstrs += " xvld $xr" + regIdx + ", $r6, " + regIdx * 32 + "\n";
            }

            initInstrs += " move $r16, $r0\n addi.d $r16, $r16, 0xF"; // load mask into r16

            // Reload xr1 at an offset masked from r12 (first set) or r13 (second set),
            // so every add below has an input produced by that load
            string postLoadInstrs1 = " and $r15, $r12, $r16\n xvldx $xr1, $r6, $r15";
            string postLoadInstrs2 = " and $r15, $r13, $r16\n xvldx $xr1, $r6, $r15";

            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " xvfadd.s $xr2, $xr2, $xr1";
            unrolledAdds[1] = " xvfadd.s $xr3, $xr3, $xr1";
            unrolledAdds[2] = " xvfadd.s $xr4, $xr4, $xr1";
            unrolledAdds[3] = " xvfadd.s $xr5, $xr5, $xr1";

            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false,
                initInstrs: initInstrs, postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
    }
}
this.GetFunctionCallParameters = "structIterations, A, fpArr"; this.DivideTimeByCount = false; this.totalOps = totalOps; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.aarch64) return true; if (isa == IUarchTest.ISA.amd64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { string postLoadInstrs = " cvtsi2ss %edi, %xmm1"; string initInstrs = " cvtsi2ss %r12, %xmm2"; string[] depInstrs = new string[4]; depInstrs[0] = " addss %xmm1, %xmm0"; depInstrs[1] = " addss %xmm1, %xmm3"; depInstrs[2] = " addss %xmm1, %xmm4"; depInstrs[3] = " addss %xmm1, %xmm5"; string[] indepInstrs = new string[2]; indepInstrs[0] = " addss %xmm2, %xmm6"; indepInstrs[1] = " addss %xmm2, %xmm7"; UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs, postLoadInstrs); } else if (isa == IUarchTest.ISA.aarch64) { string postLoadInstrs1 = " ldr s16, [x2, w25, uxtw #2]"; string initInstrs = " ldr s15, [x2]"; string[] depInstrs = new string[4]; depInstrs[0] = " fadd s0, s0, s16"; depInstrs[1] = " fadd s1, s1, s16"; depInstrs[2] = " fadd s2, s2, s16"; depInstrs[3] = " fadd s3, s3, s16"; string[] indepInstrs = new string[4]; indepInstrs[0] = " fadd s17, s17, s15"; indepInstrs[1] = " fadd s18, s18, s15"; indepInstrs[2] = " fadd s19, s19, s15"; indepInstrs[3] = " fadd s20, s20, s15"; UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs, postLoadInstrs: postLoadInstrs1); } } } } ================================================ FILE: AsmGen/tests/FaddSchedTest.cs ================================================ using System.Text; namespace AsmGen { public class FaddSchedTest : UarchTest { public FaddSchedTest(int low, int high, int step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "faddsched"; 
/// <summary>
/// The FP add scheduler test has a generator for amd64, aarch64, mips64 and riscv;
/// any other ISA is rejected.
/// </summary>
public override bool SupportsIsa(IUarchTest.ISA isa)
{
    return isa == IUarchTest.ISA.amd64
        || isa == IUarchTest.ISA.aarch64
        || isa == IUarchTest.ISA.mips64
        || isa == IUarchTest.ISA.riscv;
}
/// <summary>
/// Scheduler capacity test using scalar FP compares (fcmp) as filler. aarch64 only.
/// </summary>
public class FcmpSchedTest : UarchTest
{
    /// <summary>
    /// Sets up the test metadata and the list of filler counts to generate.
    /// </summary>
    /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    public FcmpSchedTest(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fcmpsched";
        this.Description = "FCMP Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    /// <summary>
    /// Supported on aarch64 only.
    /// </summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    /// <summary>
    /// Appends the generated aarch64 test functions to <paramref name="sb"/>.
    /// Nothing is emitted for other ISAs.
    /// </summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            // Each compare reads s16 plus one of s17..s20, the same register set the
            // sibling fadd/fmul scheduler tests use (slot 1 previously repeated s19)
            string[] unrolledAdds = new string[4];
            unrolledAdds[0] = " fcmp s17, s16";
            unrolledAdds[1] = " fcmp s18, s16";
            unrolledAdds[2] = " fcmp s19, s16";
            unrolledAdds[3] = " fcmp s20, s16";
            UarchTestHelpers.GenerateArmAsmFpSchedTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds);
        }
    }
}
/// <summary>
/// Flags register file capacity test. The filler is a flag-setting compare
/// (test on amd64, cmp on aarch64), optionally preceded by a dependent branch on aarch64.
/// </summary>
public class FlagRfTest : UarchTest
{
    private bool initialDependentBranch;

    /// <summary>
    /// Sets up the test metadata and the list of filler counts to generate.
    /// </summary>
    /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="initialDependentBranch">
    /// When true, "db" is appended to the prefix and a dependent branch is used as the
    /// post-load instruction on aarch64.
    /// </param>
    public FlagRfTest(int low, int high, int step, bool initialDependentBranch)
    {
        string prefixSuffix = initialDependentBranch ? "db" : string.Empty;
        string descriptionSuffix = initialDependentBranch ? ", preceded by dependent branch" : string.Empty;

        this.initialDependentBranch = initialDependentBranch;
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "flagrf" + prefixSuffix;
        this.Description = "Flags Register File" + descriptionSuffix;
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
        this.GetFunctionCallParameters = "structIterations, A";
        this.DivideTimeByCount = false;
    }

    /// <summary>
    /// The dependent-branch variant exists for aarch64 only; the plain variant
    /// supports amd64 and aarch64.
    /// </summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (this.initialDependentBranch)
        {
            return isa == IUarchTest.ISA.aarch64;
        }

        return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
    }

    /// <summary>
    /// Appends the generated test functions for <paramref name="isa"/> to <paramref name="sb"/>.
    /// Nothing is emitted for ISAs other than amd64 and aarch64.
    /// </summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            string[] flagWriters = { " test %r15, %r14" };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, flagWriters, flagWriters, includePtrChasingLoads: true);
            return;
        }

        if (isa == IUarchTest.ISA.aarch64)
        {
            string[] flagWriters = { " cmp x14, x15" };

            string dependentBranch = null;
            if (this.initialDependentBranch)
            {
                dependentBranch = UarchTestHelpers.GetArmDependentBranch(this.Prefix);
            }

            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb,
                this.Counts,
                this.Prefix,
                flagWriters,
                flagWriters,
                includePtrChasingLoads: true,
                postLoadInstrs1: dependentBranch,
                postLoadInstrs2: dependentBranch);

            // The branch target label is emitted after the test functions
            if (this.initialDependentBranch)
            {
                sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
            }
        }
    }
}
/// <summary>
/// Scheduler capacity test using FMOV from a vector register to a general purpose
/// register as filler. aarch64 only.
/// </summary>
public class FmovSched : UarchTest
{
    /// <summary>
    /// Sets up the test metadata and the list of filler counts to generate.
    /// </summary>
    /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    public FmovSched(int low, int high, int step)
    {
        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fmovsched";
        this.Description = "FMOV vec to gpr Scheduler";
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
    }

    /// <summary>
    /// Supported on aarch64 only.
    /// </summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64) return true;
        return false;
    }

    /// <summary>
    /// Appends the generated aarch64 test functions to <paramref name="sb"/>.
    /// Nothing is emitted for other ISAs.
    /// </summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.aarch64)
        {
            // d16 is reloaded using w25 as the index for the first instruction set and
            // w26 for the second, the same pairing CvtSchedTest and Fadd128SchedTest use.
            // The second load previously indexed with w25 as well, so the second set of
            // fmovs was tied to the same register as the first.
            string postLoadInstrs1 = " ldr d16, [x2, w25, sxtw #0]";
            string postLoadInstrs2 = " ldr d16, [x2, w26, sxtw #0]";

            // Every fmov reads d16, so each one waits on the load above
            string[] unrolledInstrs = new string[4];
            unrolledInstrs[0] = " fmov x15, d16";
            unrolledInstrs[1] = " fmov x14, d16";
            unrolledInstrs[2] = " fmov x13, d16";
            unrolledInstrs[3] = " fmov x12, d16";

            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null,
                postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
        }
    }
}
/// <summary>
/// FP register file capacity test using scalar FP adds as filler. Can optionally
/// place a dependent branch after the load (aarch64 and riscv only).
/// </summary>
public class FpRfTest : UarchTest
{
    private bool initialDependentBranch;

    /// <summary>
    /// Sets up the test metadata and the list of filler counts to generate.
    /// </summary>
    /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
    /// <param name="initialDependentBranch">
    /// When true, "db" is appended to the prefix and a dependent branch is used as the
    /// post-load instruction on aarch64 and riscv.
    /// </param>
    public FpRfTest(int low, int high, int step, bool initialDependentBranch)
    {
        string prefixSuffix = initialDependentBranch ? "db" : string.Empty;
        string descriptionSuffix = initialDependentBranch ? ", preceded by dependent branch" : string.Empty;

        this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
        this.Prefix = "fprf" + prefixSuffix;
        this.Description = "FP Register File" + descriptionSuffix;
        this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
        this.GetFunctionCallParameters = "structIterations, A, fpArr";
        this.DivideTimeByCount = false;
        this.initialDependentBranch = initialDependentBranch;
    }

    /// <summary>
    /// The dependent-branch variant supports aarch64 and riscv; the plain variant
    /// supports amd64, aarch64, mips64 and riscv.
    /// </summary>
    public override bool SupportsIsa(IUarchTest.ISA isa)
    {
        bool armOrRiscv = isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.riscv;
        if (this.initialDependentBranch)
        {
            return armOrRiscv;
        }

        return armOrRiscv || isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.mips64;
    }

    /// <summary>
    /// Dispatches to the generator for <paramref name="isa"/>; unknown ISAs emit nothing.
    /// </summary>
    public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
    {
        if (isa == IUarchTest.ISA.amd64)
        {
            GenerateX86(sb);
        }
        else if (isa == IUarchTest.ISA.aarch64)
        {
            GenerateArm(sb);
        }
        else if (isa == IUarchTest.ISA.mips64)
        {
            GenerateMips(sb);
        }
        else if (isa == IUarchTest.ISA.riscv)
        {
            GenerateRiscv(sb);
        }
    }

    // amd64: preload xmm1-xmm5 from r8, then add xmm1 into xmm2..xmm5
    private void GenerateX86(StringBuilder sb)
    {
        string initInstrs = " movss (%r8), %xmm1\n movss 4(%r8), %xmm2\n movss 8(%r8), %xmm3\n movss 12(%r8), %xmm4\n movss 16(%r8), %xmm5\n";
        string[] fillerAdds =
        {
            " addss %xmm1, %xmm2",
            " addss %xmm1, %xmm3",
            " addss %xmm1, %xmm4",
            " addss %xmm1, %xmm5",
        };

        UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillerAdds, fillerAdds, includePtrChasingLoads: false, initInstrs);
    }

    // aarch64: preload s17-s21 from x2, then add s17 into s18..s21
    private void GenerateArm(StringBuilder sb)
    {
        string dependentBranch = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
        string initInstrs = " ldr s17, [x2]\n ldr s18, [x2, 4]\n ldr s19, [x2, 8]\n ldr s20, [x2, 12]\n ldr s21, [x2, 16]\n";
        string[] fillerAdds =
        {
            " fadd s18, s18, s17",
            " fadd s19, s19, s17",
            " fadd s20, s20, s17",
            " fadd s21, s21, s17",
        };

        UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
            sb, this.Counts, this.Prefix, fillerAdds, fillerAdds, includePtrChasingLoads: false, initInstrs,
            postLoadInstrs1: dependentBranch, postLoadInstrs2: dependentBranch);

        // Branch target label goes after the generated functions
        if (this.initialDependentBranch)
        {
            sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }
    }

    // mips64: preload f8-f12 from r6, then add f8 into f9..f12
    private void GenerateMips(StringBuilder sb)
    {
        string initInstrs = " fld.s $f8, $r6, 0\n fld.s $f9, $r6, 4\n fld.s $f10, $r6, 8\n fld.s $f11, $r6, 12\n fld.s $f12, $r6, 16\n";
        string[] fillerAdds =
        {
            " fadd.s $f9, $f9, $f8",
            " fadd.s $f10, $f10, $f8",
            " fadd.s $f11, $f11, $f8",
            " fadd.s $f12, $f12, $f8",
        };

        UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillerAdds, fillerAdds, includePtrChasingLoads: false, initInstrs);
    }

    // riscv: preload f0-f4 from x12, then add f4 into f0..f3
    private void GenerateRiscv(StringBuilder sb)
    {
        string dependentBranch = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;

        // Here the branch target label is emitted before the generated functions
        if (this.initialDependentBranch)
        {
            sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
        }

        string initInstrs = " fld f0, (x12)\n fld f1, 8(x12)\n fld f2, 16(x12)\n fld f3, 24(x12)\n fld f4, 32(x12)\n";
        string[] fillerAdds =
        {
            " fadd.s f0, f0, f4",
            " fadd.s f1, f1, f4",
            " fadd.s f2, f2, f4",
            " fadd.s f3, f3, f4",
        };

        UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, fillerAdds, fillerAdds, includePtrChasingLoads: false, initInstrs,
            postLoadInstrs1: dependentBranch, postLoadInstrs2: dependentBranch);
    }
}
(%r8, %r13, 4)"; dependentStores[3] = " movss %xmm1, (%r8, %r12, 4)"; string[] indepFpInstrs = new string[4]; indepFpInstrs[0] = " addss %xmm2, %xmm3"; indepFpInstrs[1] = " addss %xmm2, %xmm4"; indepFpInstrs[2] = " addss %xmm2, %xmm5"; indepFpInstrs[3] = " addss %xmm2, %xmm6"; UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentStores, indepFpInstrs, false, initInstrs: initInstrs, postLoadInstrs: postLoadInstr); } } } } ================================================ FILE: AsmGen/tests/IdrfTest.cs ================================================ using System.Collections.Generic; using System.Text; namespace AsmGen { public class IdrfTest : UarchTest { public IdrfTest(int low, int high, int step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "idrf"; this.Description = "Immediate/Displacement Register File"; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; this.GetFunctionCallParameters = "structIterations, A, fpArr"; this.DivideTimeByCount = false; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { const string dummyBranchTargetName = "idrftest_badtarget"; if (isa == IUarchTest.ISA.amd64) { const int storeCount = 40; const int addCount = 130; List testInstructions = new List(); int storeIdx = 0, addIdx = 0; for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++) { if (addIdx < addCount) { string addInstr = " add $" + (i + 1) + ", %r" + (12 + (i % 4)); testInstructions.Add(addInstr); addIdx++; } else if (storeIdx < storeCount) { string storeInstr = " mov %r11d, " + +(((i + 1) & 0xFF) * 4) + "(%r8)"; testInstructions.Add(storeInstr); storeIdx++; } else { string branchInstr = $" test %r11, %r11\n je {dummyBranchTargetName}"; testInstructions.Add(branchInstr); } } string[] 
unrolledAdds = testInstructions.ToArray();
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);
                sb.AppendLine($"{dummyBranchTargetName}:\n int3");
            }
        }
    }
}

================================================
FILE: AsmGen/tests/IndirectBranchTest.cs
================================================
using System.Text;
using System.IO;

namespace AsmGen
{
    /// <summary>
    /// Generates indirect branch prediction tests: one assembly function for every
    /// (branch count, target count) combination, plus the C glue that declares the
    /// functions and fills a function pointer table.
    /// </summary>
    public class IndirectBranchTest : IUarchTest
    {
        private int[] branchCounts;
        private int[] targetCounts;

        // Number of "global history assist" conditional branches emitted by the aarch64 generator.
        private int globalHistoryAssistBits;

        // When true, conditional branches keyed on the target index are emitted before each indirect branch.
        private bool assists;

        /// <param name="assist">Emit global history assist branches before each indirect branch.</param>
        public IndirectBranchTest(bool assist)
        {
            Prefix = "indirectbranch";
            Description = "Indirect branch prediction";
            FunctionDefinitionParameters = "uint64_t iterations, uint32_t **arr, uint32_t arrLen, uint64_t **scratch";
            DivideTimeByCount = true;
            branchCounts = new int[] { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 };
            targetCounts = new int[] { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 160, 192, 256, 384, 512 };
            globalHistoryAssistBits = 4;
            this.assists = assist;
        }

        /// <summary>Returns true for amd64, aarch64 and mips64.</summary>
        public bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.mips64) return true;
            return false;
        }

        /// <summary>Dispatches to the generator for <paramref name="isa"/>; emits nothing for other ISAs.</summary>
        public void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                GenerateX86GccAsm(sb);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                GenerateArmAsm(sb);
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                GenerateMipsAsm(sb);
            }
        }

        // Name of the generated function for one (branch count, target count) pair.
        private string GetFunctionName(int branchCount, int targetCount)
        {
            return Prefix + branchCount + "targets" + targetCount;
        }

        // Label of one target of one branch inside the function for (branchCount, targetCount).
        private string GetTargetLabelName(int branchCount, int targetCount, int branchIndex, int targetIndex)
        {
            return GetFunctionName(branchCount, targetCount) + "branch" + branchIndex + "target" + targetIndex;
        }

        /// <summary>Emits the aarch64 test functions.</summary>
        public void GenerateArmAsm(StringBuilder sb)
        {
            for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
            {
                int currentTargetCount = targetCounts[targetCountIdx];
                for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
                {
                    int currentBranchCount = branchCounts[branchCountIdx];
                    string functionLabel = GetFunctionName(currentBranchCount, currentTargetCount);
                    string loopLabel = functionLabel + "_loop";
                    sb.AppendLine("\n" + functionLabel + ":");

                    // 0x60-byte frame with five register pairs at 0x10..0x50. x17/x18 get
                    // their own slot at 0x50: sharing 0x40 with x9/x10 would let the second
                    // store overwrite the first, and the epilogue would then reload
                    // x17/x18 with the saved x9/x10.
                    sb.AppendLine(" sub sp, sp, #0x60");
                    sb.AppendLine(" stp x17, x18, [sp, #0x50]");
                    sb.AppendLine(" stp x9, x10, [sp, #0x40]");
                    sb.AppendLine(" stp x11, x12, [sp, #0x30]");
                    sb.AppendLine(" stp x15, x16, [sp, #0x20]");
                    sb.AppendLine(" stp x13, x14, [sp, #0x10]");
                    sb.AppendLine(" eor x16, x16, x16");
                    sb.AppendLine(" eor x15, x15, x15");
                    sb.AppendLine(" eor x14, x14, x14");
                    sb.AppendLine(" eor x12, x12, x12");
                    sb.AppendLine(" eor x11, x11, x11");

                    // fill in jump tables for every branch. there has to be a better way to do this
                    for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                    {
                        // x3 = array of ptrs to jump tables
                        // x14 = index into array of jump tables
                        // x17 = ptr to jump table
                        sb.AppendLine(" ldr x17, [x3, w14, uxtw #3]");
                        for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                        {
                            // assuming 64-bit pointers and 4K page size
                            // use x16 = label index
                            string targetLabelName = GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx);
                            sb.AppendLine($" adrp x10, {targetLabelName}");
                            sb.AppendLine($" add x10, x10, :lo12:{targetLabelName}");
                            sb.AppendLine(" str x10, [x17, w16, uxtw #3]");
                            sb.AppendLine(" add w16, w16, 1");
                        }

                        sb.AppendLine(" eor x16, x16, x16");
                        sb.AppendLine(" add w14, w14, 1");
                    }

                    // w14 = branch index, w16 = pattern (target) array index
                    sb.AppendLine(loopLabel + ":");
                    sb.AppendLine(" eor w14, w14, w14");

                    // generate branch blocks
                    for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                    {
                        // get a pointer to the jump table
                        sb.AppendLine(" ldr x9, [x3, w14, uxtw #3]");

                        // look up which target to jump to
                        sb.AppendLine(" ldr x15, [x1, w14, uxtw #3]");
                        sb.AppendLine(" add w14, w14, 1");
                        sb.AppendLine(" ldr w13, [x15, w16, uxtw #2]");

                        // use the target index (w13) to index into the jump table, and branch on it
                        sb.AppendLine(" ldr x17, [x9, w13, uxtw #3]");

                        // global history assist branches
                        // w13 = index into jump table. make that correlate with global history
                        if (this.assists)
                        {
                            sb.AppendLine(" mov x18, 1");
                            sb.AppendLine(" eor w12, w12, w12");
                            for (int eaxBits = 0; eaxBits < globalHistoryAssistBits; eaxBits++)
                            {
                                string targetName = functionLabel + "branch" + branchIdx + "ghist" + eaxBits;
                                sb.AppendLine(" and w12, w13, w18");
                                sb.AppendLine($" cbnz w12, {targetName}");
                                sb.AppendLine(" nop");
                                sb.AppendLine($"{targetName}:");
                                sb.AppendLine(" lsl w18, w18, 1");
                            }
                        }

                        // branch on value of x17
                        sb.AppendLine($" br x17");
                        sb.AppendLine(" nop");

                        // generate targets
                        for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                        {
                            sb.AppendLine(GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx) + ":");
                            sb.AppendLine($" nop");
                        }
                    }

                    // increment w16, and basically cmov 0 -> w16 if w16 = list length
                    sb.AppendLine(" add w16, w16, 1");
                    sb.AppendLine(" cmp w16, w2");
                    sb.AppendLine(" csel w16, w11, w16, EQ");
                    sb.AppendLine(" sub x0, x0, 1");
                    sb.AppendLine($" cbnz x0, {loopLabel}");
                    sb.AppendLine(" mov x0, x12");

                    // Epilogue mirrors the prologue slots exactly.
                    sb.AppendLine(" ldp x9, x10, [sp, #0x40]");
                    sb.AppendLine(" ldp x11, x12, [sp, #0x30]");
                    sb.AppendLine(" ldp x15, x16, [sp, #0x20]");
                    sb.AppendLine(" ldp x13, x14, [sp, #0x10]");
                    sb.AppendLine(" ldp x17, x18, [sp, #0x50]");
                    sb.AppendLine(" add sp, sp, #0x60");
                    sb.AppendLine(" ret");
                }
            }
        }

        /// <summary>Emits the amd64 test functions (AT&amp;T syntax).</summary>
        public void GenerateX86GccAsm(StringBuilder sb)
        {
            for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
            {
                int currentTargetCount = targetCounts[targetCountIdx];
                for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
                {
                    /* rdi = iteration count
                     * rsi = array of target selection arrays, one for each branch
                     * rdx = length of pattern array
                     * rcx = array of jump tables, one for each branch
                     */
                    int currentBranchCount = branchCounts[branchCountIdx];
                    string functionLabel = GetFunctionName(currentBranchCount, currentTargetCount);
                    sb.AppendLine("\n" + functionLabel + ":");
                    sb.AppendLine(" push %rbx");
                    sb.AppendLine(" push %r8");
                    sb.AppendLine(" push %r9");
                    sb.AppendLine(" push %r13");
                    sb.AppendLine(" push %r15");
                    sb.AppendLine(" push %r14");
                    sb.AppendLine(" xor %rbx, %rbx");
                    sb.AppendLine(" xor %r8, %r8");
                    sb.AppendLine(" xor %r9, %r9");

                    // initialize jump table
                    for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                    {
                        // rcx = array of ptrs to jump tables
                        // r9 = index into array of jump tables
                        // r15 = ptr to jump table
                        // load jump table base address into r15
                        sb.AppendLine(" mov (%rcx,%r9,8), %r15");
                        for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                        {
                            // assuming 64-bit pointers and 4K page size
                            // use rbx = index into the jump table
                            string targetLabelName = GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx);
                            sb.AppendLine($" lea {targetLabelName}(%rip), %rax");
                            sb.AppendLine($" mov %rax, (%r15,%rbx,8)");
                            sb.AppendLine(" inc %rbx");
                        }

                        sb.AppendLine(" xor %rbx, %rbx");
                        sb.AppendLine(" inc %r9");
                    }

                    sb.AppendLine(" xor %r8, %r8");
                    sb.AppendLine(" xor %r9, %r9");
                    string loopLabel = functionLabel + "_loop";
                    sb.AppendLine("\n" + loopLabel + ":");
                    sb.AppendLine(" xor %r11, %r11"); // set index into arr of arrs to 0
                    for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                    {
                        sb.AppendLine(" mov (%rcx,%r11,8), %r15"); // load jump table base pointer into r15
                        sb.AppendLine(" mov (%rsi,%r11,8), %r10"); // load target select array base pointer into r10
                        sb.AppendLine(" inc %r11");
                        sb.AppendLine(" mov (%r10,%rbx,4), %eax"); // get the target for the current iteration into eax
                        sb.AppendLine(" mov (%r15,%rax,8), %r14"); // load address of jump target from jump table
                        if (assists)
                        {
                            // rsi is borrowed as the bit mask, so park the pattern array pointer in r13.
                            sb.AppendLine(" mov %rsi, %r13");
                            sb.AppendLine(" mov $1, %rsi");

                            // NOTE(review): this path tests 7 bits while the aarch64 path uses
                            // globalHistoryAssistBits; kept as-is so emitted code does not change.
                            for (int eaxBits = 0; eaxBits < 7; eaxBits++)
                            {
                                string targetName = functionLabel + "branch" + branchIdx + "ghist" + eaxBits;
                                sb.AppendLine(" test %eax, %esi");
                                sb.AppendLine($" jnz {targetName}");
                                sb.AppendLine(" nop");
                                sb.AppendLine($"{targetName}:");
                                sb.AppendLine(" shl $1, %esi");
                            }

                            // Restore rsi only when it was saved above. Emitting this without
                            // the matching save would overwrite rsi with whatever the caller
                            // left in r13.
                            sb.AppendLine(" mov %r13, %rsi");
                        }

                        sb.AppendLine(" jmp *%r14"); // and jump to it

                        // generate targets
                        for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                        {
                            sb.AppendLine(GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx) + ":");
                            sb.AppendLine($" nop");
                        }
                    }

                    // loop around in pattern history test array if necessary
                    // avoiding an extra branch to not pollute BPU history
                    sb.AppendLine(" inc %rbx");
                    sb.AppendLine(" cmp %rbx, %rdx");
                    sb.AppendLine(" cmove %r9, %rbx");

                    // end of main loop over iteration count
                    sb.AppendLine(" dec %rdi");
                    sb.AppendLine(" jnz " + loopLabel);

                    // function epilogue
                    sb.AppendLine(" mov %r8, %rax");
                    sb.AppendLine(" pop %r14");
                    sb.AppendLine(" pop %r15");
                    sb.AppendLine(" pop %r13");
                    sb.AppendLine(" pop %r9");
                    sb.AppendLine(" pop %r8");
                    sb.AppendLine(" pop %rbx");
                    sb.AppendLine(" ret");
                }
            }
        }

        /// <summary>Emits the test functions for the ISA selected by <c>IUarchTest.ISA.mips64</c>.</summary>
        public void GenerateMipsAsm(StringBuilder sb)
        {
            for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
            {
                int currentTargetCount = targetCounts[targetCountIdx];
                for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
                {
                    /* r4 = iteration count
                     * r5 = array of target selection arrays, one for each branch
                     * r6 = length of pattern array
                     * r7 = array of jump tables, one for each branch
                     */
                    int currentBranchCount = branchCounts[branchCountIdx];
                    string functionLabel = GetFunctionName(currentBranchCount, currentTargetCount);
                    sb.AppendLine("\n" + functionLabel + ":");

                    // initialize jump tables. r12-r20 are temporary regs.
                    sb.AppendLine(" move $r13, $r7"); // use r13 to access array of pointers to jump tables
                    for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                    {
                        sb.AppendLine(" ld.d $r15, $r13, 0"); // load address of branch's jump table into r15

                        // initialize the jump table. r15 = base addr. rely on C# for bounds :)
                        for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                        {
                            // write label addresses into array
                            string targetLabelName = GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx);
                            sb.AppendLine(" la $r16, " + targetLabelName); // load branch target address into r16
                            sb.AppendLine(" st.d $r16, $r15, 0"); // store branch target address
                            sb.AppendLine(" addi.d $r15, $r15, 8"); // increment array pointer
                        }

                        sb.AppendLine(" addi.d $r13, $r13, 8"); // increment array pointer for array of pointers to jump tables
                    }

                    // loop through branches for (iterations) times
                    string loopLabel = functionLabel + "_loop";
                    sb.AppendLine(" move $r14, $r0"); // r14 = branch target index
                    sb.AppendLine(" move $r17, $r0");
                    sb.AppendLine(" addi.d $r17, $r17, 1"); // use r17 just to store 1
                    sb.AppendLine("\n" + loopLabel + ":");
                    sb.AppendLine(" move $r12, $r5"); // r12 to hold pointer to target selection array
                    sb.AppendLine(" move $r13, $r7"); // r13 to hold pointer to jump target array
                    for (int branchIdx = 0; branchIdx < currentBranchCount; branchIdx++)
                    {
                        sb.AppendLine(" ld.d $r16, $r12, 0"); // r16 = base address of target select array
                        sb.AppendLine(" ld.d $r18, $r13, 0"); // r18 = base address of jump target array

                        // target select array[target index]
                        sb.AppendLine(" alsl.d $r15, $r14, $r0, 0x2");
                        sb.AppendLine(" add.d $r15, $r15, $r16");
                        sb.AppendLine(" ld.w $r19, $r15, 0"); // load 32-bit target index
                        sb.AppendLine(" alsl.d $r15, $r19, $r0, 0x3"); // now index into jump table
                        sb.AppendLine(" add.d $r15, $r18, $r15");
                        sb.AppendLine(" ld.d $r20, $r15, 0");

                        // increment pointers for next branch
                        sb.AppendLine(" addi.d $r12, $r12, 8");
                        sb.AppendLine(" addi.d $r13, $r13, 8");
                        sb.AppendLine(" jr $r20");

                        // generate targets
                        for (int targetIdx = 0; targetIdx < currentTargetCount; targetIdx++)
                        {
                            sb.AppendLine(GetTargetLabelName(currentBranchCount, currentTargetCount, branchIdx, targetIdx) + ":");
                            sb.AppendLine($" nop");
                        }
                    }

                    // loop back. and try to reset branch index without a branch
                    sb.AppendLine(" addi.d $r14, $r14, 1");

                    // if r14 == r6 (pattern array length), set r14 back to 0 somehow
                    sb.AppendLine(" sub.d $r12, $r14, $r6"); // 12 = temporary result of comparison
                    sb.AppendLine(" maskeqz $r14, $r14, $r12"); // if r12 = 0, set r14 to 0. otherwise use current value
                    sb.AppendLine(" sub.d $r4, $r4, $r17");
                    sb.AppendLine(" bnez $r4, " + loopLabel);
                    sb.AppendLine(" jr $r1");
                }
            }
        }

        // kinda hack this to put in initialization code we need
        /// <summary>
        /// Emits one C extern declaration per generated function, then the
        /// initialization code, then the contents of GccIndirectBranchFunction.c.
        /// </summary>
        public void GenerateExternLines(StringBuilder sb)
        {
            for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
                for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
                    sb.AppendLine("extern uint64_t " + GetFunctionName(branchCounts[branchCountIdx], targetCounts[targetCountIdx]) + $"({FunctionDefinitionParameters}) __attribute((sysv_abi));");

            GenerateInitializationCode(sb);
            string gccFunction = File.ReadAllText(Path.Combine(Program.DataFilesDir, "GccIndirectBranchFunction.c"));
            sb.AppendLine(gccFunction);
        }

        /// <summary>
        /// Emits C definitions for the branch-count and target-count arrays, the
        /// function pointer table, and a function that populates the table.
        /// </summary>
        public void GenerateInitializationCode(StringBuilder sb)
        {
            sb.AppendLine($"uint32_t maxIndirectBranchCount = {branchCounts.Length};");
            sb.Append($"uint32_t indirectBranchCounts[{branchCounts.Length}] = ");
            sb.Append("{ " + branchCounts[0]);
            for (int i = 1; i < branchCounts.Length; i++) sb.Append(", " + branchCounts[i]);
            sb.AppendLine(" };");

            sb.Append($"uint32_t indirectBranchTargetCounts[{targetCounts.Length}] = ");
            sb.Append("{ " + targetCounts[0]);
            for (int i = 1; i < targetCounts.Length; i++) sb.Append(", " + targetCounts[i]);
            sb.AppendLine(" };");

            // Function pointer table, indexed [branch count index][target count index].
            sb.AppendLine($"uint64_t (__attribute((sysv_abi)) *indirectBranchTestFuncArr[{branchCounts.Length}][{targetCounts.Length}])({FunctionDefinitionParameters});");
            sb.AppendLine("void initializeIndirectBranchFuncArr() {");
            for (int i = 0; i < branchCounts.Length; i++)
            {
                for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
                {
                    sb.AppendLine($" indirectBranchTestFuncArr[{i}][{targetCountIdx}] = {GetFunctionName(branchCounts[i], targetCounts[targetCountIdx])};");
                }
            }

            sb.AppendLine("}");
        }

        public string Prefix { get; set; }
        public string Description { get; set; }
        public int[] Counts;
        public string FunctionDefinitionParameters { get; set; }
        public string GetFunctionCallParameters { get; set; }
        public bool DivideTimeByCount { get; set; }

        /// <summary>Emits a <c>.global</c> directive for every generated function.</summary>
        public void GenerateAsmGlobalLines(StringBuilder sb)
        {
            for (int branchCountIdx = 0; branchCountIdx < branchCounts.Length; branchCountIdx++)
                for (int targetCountIdx = 0; targetCountIdx < targetCounts.Length; targetCountIdx++)
                    sb.AppendLine(".global " + GetFunctionName(branchCounts[branchCountIdx], targetCounts[targetCountIdx]));
        }

        /// <summary>
        /// Emits the C block that runs when the test name argument starts with
        /// <see cref="Prefix"/>; its body is read from IndirectBranchTestBlock.c.
        /// </summary>
        public void GenerateTestBlock(StringBuilder sb, IUarchTest.ISA isa)
        {
            sb.AppendLine(" if (argc > 1 && strncmp(test_name, \"" + Prefix + "\", " + Prefix.Length + ") == 0) {");
            sb.AppendLine(" printf(\"" + Description + ":\\n\");");
            string ibMain = File.ReadAllText(Path.Combine(Program.DataFilesDir, "IndirectBranchTestBlock.c"));
            sb.AppendLine(ibMain);
            sb.AppendLine(" }\n");
        }
    }
}

================================================
FILE: AsmGen/tests/IntRfDepStoreTest.cs
================================================
using System.Text;

namespace AsmGen
{
    public class IntRfTestDependentStore : UarchTest
    {
        public IntRfTestDependentStore(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "intrfds";
            this.Description = "Integer Register File, preceded by a dependent store";
            this.FunctionDefinitionParameters = "uint64_t iterations,
int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        // Only aarch64 is advertised as supported.
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        // NOTE(review): only the aarch64 branch passes store instructions (postLoadInstrs1/2).
        // The amd64, mips64 and riscv branches emit plain adds with no store, and SupportsIsa
        // rejects those ISAs - presumably callers check SupportsIsa first; confirm.
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " add %r11, %r15";
                unrolledAdds[1] = " add %r11, %r14";
                unrolledAdds[2] = " add %r11, %r13";
                unrolledAdds[3] = " add %r11, %r12";
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // One store per half: the first is indexed by w25, the second by w26.
                string postLoadInstrs1 = "str w15, [x2, w25, uxtw #2]";
                string postLoadInstrs2 = "str w15, [x2, w26, uxtw #2]";
                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " add x15, x15, x11";
                unrolledAdds[1] = " add x14, x14, x11";
                unrolledAdds[2] = " add x13, x13, x11";
                unrolledAdds[3] = " add x12, x12, x11";
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " add.d $r15, $r15, $r14";
                unrolledAdds[1] = " add.d $r16, $r16, $r14";
                unrolledAdds[2] = " add.d $r17, $r17, $r14";
                unrolledAdds[3] = " add.d $r18, $r18, $r14";
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " add x28, x28, x29";
                unrolledAdds[1] = " add x30, x30, x29";
                unrolledAdds[2] = " add x31, x31, x29";
                unrolledAdds[3] = " add x18, x18, x29";
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);
            }
        }
    }
}

================================================
FILE: AsmGen/tests/IntRfTest.cs
================================================
using System.Text;

namespace AsmGen
{
    // Integer register file test. With initialDependentBranch set, the prefix gains a
    // "db" suffix and a dependent branch is passed as the post-load instruction on
    // aarch64 and riscv.
    public class IntRfTest : UarchTest
    {
        private bool initialDependentBranch;

        public IntRfTest(int low, int high, int step, bool initialDependentBranch)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "intrf" + (initialDependentBranch ? "db" : string.Empty);
            this.Description = "Integer Register File" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
            this.initialDependentBranch = initialDependentBranch;
        }

        // The dependent-branch variant supports aarch64 and riscv only; the plain variant
        // supports all four ISAs.
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (this.initialDependentBranch)
            {
                if (isa == IUarchTest.ISA.aarch64) return true;
                if (isa == IUarchTest.ISA.riscv) return true;
                return false;
            }

            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.mips64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        // Emits four independent integer adds as the filler for the selected ISA.
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " add %r11, %r15";
                unrolledAdds[1] = " add %r11, %r14";
                unrolledAdds[2] = " add %r11, %r13";
                unrolledAdds[3] = " add %r11, %r12";
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " add x15, x15, x11";
                unrolledAdds[1] = " add x14, x14, x11";
                unrolledAdds[2] = " add x13, x13, x11";
                unrolledAdds[3] = " add x12, x12, x11";
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
                // The branch target is appended after the generated functions here.
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " add.d $r15, $r15, $r14";
                unrolledAdds[1] = " add.d $r16, $r16, $r14";
                unrolledAdds[2] = " add.d $r17, $r17, $r14";
                unrolledAdds[3] = " add.d $r18, $r18, $r14";
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                // NOTE(review): unlike the aarch64 branch, the branch target is appended
                // before the generated functions here - confirm the ordering is intended.
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null;
                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " add x28, x28, x29";
                unrolledAdds[1] = " add x30, x30, x29";
                unrolledAdds[2] = " add x31, x31, x29";
                unrolledAdds[3] = " add x18, x18, x29";
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
            }
        }
    }
}

================================================
FILE: AsmGen/tests/JsCvtNsq.cs
================================================
using System.Text;

namespace AsmGen
{
    // FJCVTZS scheduler test that excludes a possible non-scheduling queue (aarch64 only).
    public class JsCvtNsq : UarchTest
    {
        // Passed to GenerateArmAsmNsqTestFuncs as its second argument.
        private int totalOps;

        public JsCvtNsq(int low, int high, int step, int totalOps)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "jscvtnsq";
            this.Description = "FJCVTZS (FP Javascript Convert to Signed Fixed Point, Rounding toward Zero) Scheduler, excluding possible NSQ";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.totalOps = totalOps;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64)
            {
                // d16 is loaded using w25 as the offset; d15 is loaded once from [x2].
                string postLoadInstrs1 = " ldr d16, [x2, w25, sxtw #0]";
                string initInstrs = " ldr d15, [x2]";

                // Conversions that read d16.
                string[] depInstrs = new string[4];
                depInstrs[0] = " fjcvtzs w15, d16";
                depInstrs[1] = " fjcvtzs w14, d16";
                depInstrs[2] = " fjcvtzs w13, d16";
                depInstrs[3] = " fjcvtzs w12, d16";

                // Conversions that read d15 instead.
                string[] indepInstrs = new string[4];
                indepInstrs[0] = " fjcvtzs w15, d15";
                indepInstrs[1] = " fjcvtzs w14, d15";
                indepInstrs[2] = " fjcvtzs w13, d15";
                indepInstrs[3] = " fjcvtzs w12, d15";
                UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.totalOps, this.Counts, this.Prefix, depInstrs,
indepInstrs, false, initInstrs, postLoadInstrs: postLoadInstrs1);
            }
        }
    }
}

================================================
FILE: AsmGen/tests/JsCvtSched.cs
================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// FJCVTZS scheduler capacity test (aarch64 only).
    /// </summary>
    public class JsCvtSched : UarchTest
    {
        public JsCvtSched(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "jscvtsched";
            this.Description = "FJCVTZS (FP Javascript Convert to Signed Fixed Point, Rounding toward Zero) Scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Returns true only for aarch64.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        /// <summary>
        /// Emits conversions that all read d16, where d16 is loaded by the post-load
        /// instruction of each half.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64)
            {
                // The first half's load is indexed by w25, the second half's by w26.
                // This matches the w25/w26 pairing used by the other aarch64 tests,
                // so each half's conversions depend on that half's own load.
                // NOTE(review): assumes the helper puts the second pointer-chasing result in w26 - confirm.
                string postLoadInstrs1 = " ldr d16, [x2, w25, sxtw #0]";
                string postLoadInstrs2 = " ldr d16, [x2, w26, sxtw #0]";
                string[] unrolledInstrs = new string[4];
                unrolledInstrs[0] = " fjcvtzs w15, d16";
                unrolledInstrs[1] = " fjcvtzs w14, d16";
                unrolledInstrs[2] = " fjcvtzs w13, d16";
                unrolledInstrs[3] = " fjcvtzs w12, d16";
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null, postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
            }
        }
    }
}

================================================
FILE: AsmGen/tests/JumpNsqTest.cs
================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Scheduler test using not-taken jumps, excluding a possible nsq (amd64 only).
    /// </summary>
    public class JumpNsqTest : UarchTest
    {
        public JumpNsqTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "jumpnsq";
            this.Description = "Scheduler, Not-Taken Jumps, excluding possible nsq";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            // if (isa == IUarchTest.ISA.aarch64) return true;
            // if (isa == IUarchTest.ISA.mips64) return true;
            // if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        /// <summary>
        /// Emits compare + je pairs. The dependent ones compare %rdi with %rsi, the
        /// independent ones compare %r13 with %r14. The shared jump target holds an int3.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] dependentJumps = new string[1];
                dependentJumps[0] = " cmp %rdi, %rsi\n je jumpnsq_reallybadthing";
                string[] independentJumps = new string[1];
                independentJumps[0] = " cmp %r13, %r14\n je jumpnsq_reallybadthing";
                UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentJumps, independentJumps);
                sb.AppendLine("jumpnsq_reallybadthing:\n int3");
            }
        }
    }
}

================================================
FILE: AsmGen/tests/JumpSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Scheduler test using not-taken jumps (amd64, aarch64 and riscv).
    /// </summary>
    public class JumpSchedTest : UarchTest
    {
        public JumpSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "jumpsched";
            this.Description = "Scheduler, Not-Taken Jumps";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            // if (isa == IUarchTest.ISA.mips64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        /// <summary>
        /// Emits one conditional jump per filler slot, all aimed at a shared label that is
        /// appended after the generated functions.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] unrolledJumps = new string[1];
                unrolledJumps[0] = " cmp %rdi, %rsi\n je jumpsched_reallybadthing";
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);
                sb.AppendLine("jumpsched_reallybadthing:\n int3");
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                string[] unrolledJumps = new string[1];
                unrolledJumps[0] = " cmp x25, x26\n b.eq jumpsched_reallybadthing";
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);
                sb.AppendLine("jumpsched_reallybadthing:\n .word 0xf7f0a000");
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                // todo
                string[] unrolledJumps = new string[1];
                unrolledJumps[0] = " beq x5, x6, jumpsched_reallybadthing";
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, false);
                sb.AppendLine("jumpsched_reallybadthing:\n .word 0x00000000");
            }
        }
    }
}

================================================
FILE: AsmGen/tests/LdqTest.cs
================================================
using System.Text;

namespace AsmGen
{
    public class LdqTest : UarchTest
    {
        bool initialDependentBranch;

        public LdqTest(int low, int high, int step, bool initialDependentBranch)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "ldq" + (initialDependentBranch ? "db" : string.Empty);
            this.Description = "Load Queue" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.initialDependentBranch = initialDependentBranch;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (this.initialDependentBranch)
            {
                if (isa == IUarchTest.ISA.aarch64) return true;
                if (isa == IUarchTest.ISA.riscv) return true;
                return false;
            }

            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.mips64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] unrolledLoads = new string[4];
                unrolledLoads[0] = " mov (%r8), %r15";
                unrolledLoads[1] = " mov (%r8), %r14";
                unrolledLoads[2] = " mov (%r8), %r13";
                unrolledLoads[3] = " mov (%r8), %r12";
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledLoads, unrolledLoads, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                string postLoadInstr = this.initialDependentBranch ?
UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null; string[] unrolledLoads = new string[4]; unrolledLoads[0] = " ldr x15, [x2]"; unrolledLoads[1] = " ldr x14, [x2]"; unrolledLoads[2] = " ldr x13, [x2]"; unrolledLoads[3] = " ldr x12, [x2]"; UarchTestHelpers.GenerateArmAsmStructureTestFuncs( sb, this.Counts, this.Prefix, unrolledLoads, unrolledLoads, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstr, postLoadInstrs2: postLoadInstr); if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix)); } else if (isa == IUarchTest.ISA.mips64) { string[] unrolledLoads = new string[4]; unrolledLoads[0] = " ld.d $r15, $r6, 0"; unrolledLoads[1] = " ld.d $r16, $r6, 8"; unrolledLoads[2] = " ld.d $r17, $r6, 16"; unrolledLoads[3] = " ld.d $r18, $r6, 24"; UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledLoads, unrolledLoads, includePtrChasingLoads: true); } else if (isa == IUarchTest.ISA.riscv) { string postLoadInstrs = this.initialDependentBranch ? 
UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null; string[] unrolledLoads = new string[4]; unrolledLoads[0] = " ld x28, (x11)"; unrolledLoads[1] = " ld x29, 8(x11)"; unrolledLoads[2] = " ld x30, 16(x11)"; unrolledLoads[3] = " ld x31, 24(x11)"; UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledLoads, unrolledLoads, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs); if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix)); } } } } ================================================ FILE: AsmGen/tests/LeaSchedTest.cs ================================================ using System.Text; namespace AsmGen { public class LeaSchedTest : UarchTest { public LeaSchedTest(int low, int high, int step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "leasched"; this.Description = "Scheduler, lea with base + index + offset"; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; this.GetFunctionCallParameters = "structIterations, A"; this.DivideTimeByCount = false; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { string[] unrolledAdds = new string[4]; unrolledAdds[0] = " lea 128(%r15, %rdi), %r15"; unrolledAdds[1] = " lea 128(%r14, %rdi), %r14"; unrolledAdds[2] = " lea 128(%r13, %rdi), %r13"; unrolledAdds[3] = " lea 128(%r12, %rdi), %r12"; UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false); } } } } ================================================ FILE: AsmGen/tests/LoadNsq.cs ================================================ using System.Text; namespace AsmGen { public class LoadNsq : UarchTest { public LoadNsq(int low, int high, 
int step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "loadnsq"; this.Description = "Load Address Scheduler, Excluding any NSQ"; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; this.GetFunctionCallParameters = "structIterations, A, fpArr"; this.DivideTimeByCount = false; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.aarch64) return true; if (isa == IUarchTest.ISA.amd64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { string[] dep = new string[3]; dep[0] = " mov (%r8, %rdi, 4), %r15"; dep[1] = " mov (%r8, %rdi, 4), %r14"; dep[2] = " mov (%r8, %rdi, 4), %r13"; string[] indep = new string[3]; indep[0] = " mov (%r8), %r15"; indep[1] = " mov (%r8), %r14"; indep[2] = " mov (%r8), %r13"; UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dep, indep, ptrChasingLoadsInSq: true); } if (isa == IUarchTest.ISA.aarch64) { string[] dep = new string[3]; dep[0] = " ldr w15, [x2, w25, uxtw #2]"; dep[1] = " ldr w14, [x2, w25, uxtw #2]"; dep[2] = " ldr w13, [x2, w25, uxtw #2]"; string[] indep = new string[3]; indep[0] = " ldr w12, [x2]"; indep[1] = " ldr w11, [x2]"; indep[2] = " ldr w10, [x2]"; UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dep, indep); } } } }
================================================ FILE: AsmGen/tests/LoadSchedTest.cs ================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Load scheduler test. For each supported ISA (amd64, aarch64, mips64,
    /// riscv) it builds four load instructions per filler slot and passes them
    /// to the matching structure-test generator with pointer chasing loads
    /// enabled.
    /// </summary>
    public class LoadSchedTest : UarchTest
    {
        /// <summary>
        /// Builds the list of filler counts from <paramref name="low"/>,
        /// <paramref name="high"/> and <paramref name="step"/> and fills in the
        /// metadata used for the generated C prototypes and calls.
        /// </summary>
        public LoadSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "loadsched";
            this.Description = "Load Scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Returns true for amd64, aarch64, mips64 and riscv.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64
                || isa == IUarchTest.ISA.aarch64
                || isa == IUarchTest.ISA.mips64
                || isa == IUarchTest.ISA.riscv;
        }

        /// <summary>
        /// Appends the test functions for <paramref name="isa"/> to
        /// <paramref name="sb"/>; emits nothing for an unsupported ISA.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) EmitAmd64Loads(sb);
            else if (isa == IUarchTest.ISA.aarch64) EmitAarch64Loads(sb);
            else if (isa == IUarchTest.ISA.mips64) EmitMips64Loads(sb);
            else if (isa == IUarchTest.ISA.riscv) EmitRiscvLoads(sb);
        }

        // amd64: the first filler slot indexes with %rdi, the second with %rsi.
        private void EmitAmd64Loads(StringBuilder sb)
        {
            string[] loadsViaRdi =
            {
                " mov (%r8, %rdi, 4), %r15",
                " mov (%r8, %rdi, 4), %r14",
                " mov (%r8, %rdi, 4), %r13",
                " mov (%r8, %rdi, 4), %r12",
            };
            string[] loadsViaRsi =
            {
                " mov (%r8, %rsi, 4), %r15",
                " mov (%r8, %rsi, 4), %r14",
                " mov (%r8, %rsi, 4), %r13",
                " mov (%r8, %rsi, 4), %r12",
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, loadsViaRdi, loadsViaRsi, includePtrChasingLoads: true);
        }

        // aarch64: the first filler slot indexes with w25, the second with w26.
        private void EmitAarch64Loads(StringBuilder sb)
        {
            string[] loadsViaW25 =
            {
                " ldr w15, [x2, w25, uxtw #2]",
                " ldr w14, [x2, w25, uxtw #2]",
                " ldr w13, [x2, w25, uxtw #2]",
                " ldr w12, [x2, w25, uxtw #2]",
            };
            string[] loadsViaW26 =
            {
                " ldr w15, [x2, w26, uxtw #2]",
                " ldr w14, [x2, w26, uxtw #2]",
                " ldr w13, [x2, w26, uxtw #2]",
                " ldr w12, [x2, w26, uxtw #2]",
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, loadsViaW25, loadsViaW26, includePtrChasingLoads: true);
        }

        // mips64 branch: both slots share one load sequence addressed through $r19,
        // which the post-load instructions derive from $r12 (slot 1) or $r13 (slot 2).
        private void EmitMips64Loads(StringBuilder sb)
        {
            string addrFromR12 = " andi $r19, $r12, 0xF\n add.d $r19, $r19, $r6";
            string addrFromR13 = " andi $r19, $r13, 0xF\n add.d $r19, $r19, $r6";

            // NOTE(review): offsets are 0, 8, 12, 16 whereas the riscv branch uses
            // 0, 8, 16, 24 -- confirm whether 12 is intentional.
            string[] loads =
            {
                " ld.d $r15, $r19, 0",
                " ld.d $r16, $r19, 8",
                " ld.d $r17, $r19, 12",
                " ld.d $r18, $r19, 16",
            };

            UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                sb,
                this.Counts,
                this.Prefix,
                loads,
                loads,
                includePtrChasingLoads: true,
                null,
                postLoadInstrs1: addrFromR12,
                postLoadInstrs2: addrFromR13);
        }

        // riscv: x5 and x6 are pointer chasing loads; x7 is derived from one of
        // them and used as the base address of the filler loads.
        private void EmitRiscvLoads(StringBuilder sb)
        {
            string addrFromX5 = " andi x7, x5, 0xF\n add x7, x7, x12";
            string addrFromX6 = " andi x7, x6, 0xF\n add x7, x7, x12";
            string[] loads =
            {
                " ld x28, (x7)",
                " ld x29, 8(x7)",
                " ld x30, 16(x7)",
                " ld x31, 24(x7)",
            };
            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, loads, loads, includePtrChasingLoads: true, postLoadInstrs1: addrFromX5, postLoadInstrs2: addrFromX6);
        }
    }
}
================================================ FILE: AsmGen/tests/MaddSchedTest.cs ================================================
using System.Text; namespace AsmGen { public class MaddSchedTest : UarchTest { public MaddSchedTest(int low, int high, int step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "maddsched"; this.Description = "Scheduler, Integer Multiply-Add"; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; this.GetFunctionCallParameters = "structIterations, A"; this.DivideTimeByCount = false; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.aarch64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.aarch64) { string[] unrolledMuls = new string[4]; unrolledMuls[0] = " madd x15, x15, x25, x10"; unrolledMuls[1] = " madd x14, x14, x25, x10"; unrolledMuls[2] = " madd x13, x13, x25,
x10"; unrolledMuls[3] = " madd x12, x12, x25, x10"; UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledMuls, unrolledMuls, includePtrChasingLoads: false); } } } }
================================================ FILE: AsmGen/tests/MaskRfTest.cs ================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Mask register test whose filler instructions are <c>kaddb</c> operations
    /// on %k1-%k4 with %k0 as the common source operand. Only amd64 is supported.
    /// </summary>
    public class MaskRfTest : UarchTest
    {
        /// <summary>
        /// Builds the list of filler counts from <paramref name="low"/>,
        /// <paramref name="high"/> and <paramref name="step"/> and fills in the
        /// metadata used for the generated C prototypes and calls.
        /// </summary>
        public MaskRfTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "maskrf";
            this.Description = "Mask Registers - AVX-512 only";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        /// <summary>Returns true only for amd64.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64;
        }

        /// <summary>
        /// Appends the amd64 test functions to <paramref name="sb"/>; emits
        /// nothing for any other ISA.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64) return;

            string[] maskAdds =
            {
                " kaddb %k0, %k1, %k1",
                " kaddb %k0, %k2, %k2",
                " kaddb %k0, %k3, %k3",
                " kaddb %k0, %k4, %k4",
            };

            // The same filler sequence is supplied for both filler slots.
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, maskAdds, maskAdds, false);
        }
    }
}
================================================ FILE: AsmGen/tests/MixAddJumpSched.cs ================================================
using System.Text; namespace AsmGen { public class MixAddJumpSchedTest : UarchTest { public MixAddJumpSchedTest(int low, int high, int step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "mixaddjumpsched"; this.Description = "Scheduler, Mixed Adds and Not-Taken Jumps in 3:1 ratio"; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; this.GetFunctionCallParameters = "structIterations, A"; this.DivideTimeByCount = false; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa ==
IUarchTest.ISA.amd64) return true; if (isa == IUarchTest.ISA.aarch64) return true; // if (isa == IUarchTest.ISA.mips64) return true; // if (isa == IUarchTest.ISA.riscv) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { string[] unrolledJumps = new string[4]; unrolledJumps[0] = " cmp %rdi, %rsi\n je mixaddjumpsched_reallybadthing"; unrolledJumps[1] = " add %rsi, %r15"; unrolledJumps[2] = " add %rsi, %r14"; unrolledJumps[3] = " add %rsi, %r14"; UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true); sb.AppendLine("mixaddjumpsched_reallybadthing:\n int3"); } else if (isa == IUarchTest.ISA.aarch64) { string[] unrolledJumps = new string[4]; unrolledJumps[0] = " cmp x25, x26\n b.eq mixaddjumpsched_reallybadthing"; unrolledJumps[1] = " add x15, x15, x25"; unrolledJumps[2] = " add x14, x14, x25"; unrolledJumps[3] = " add x14, x14, x25"; UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true); sb.AppendLine("mixaddjumpsched_reallybadthing:\n .word 0xf7f0a000"); } else if (isa == IUarchTest.ISA.riscv) { // todo string[] unrolledAdds = new string[4]; unrolledAdds[0] = " mul x30, x30, x5"; unrolledAdds[1] = " mul x29, x29, x5"; unrolledAdds[2] = " mul x28, x28, x5"; unrolledAdds[3] = " mul x31, x31, x5"; string[] unrolledAdds1 = new string[4]; unrolledAdds1[0] = " mul x30, x30, x6"; unrolledAdds1[1] = " mul x31, x31, x6"; unrolledAdds1[2] = " mul x28, x28, x6"; unrolledAdds1[3] = " mul x29, x29, x6"; UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds1, false); } } } }
================================================ FILE: AsmGen/tests/MixAddvJsCvtNsq.cs ================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// NSQ-style test that alternates <c>addv</c> and <c>fjcvtzs</c>
    /// instructions. One instruction set reads registers reloaded after the
    /// pointer chasing load (q16 / d2); the other reads registers loaded once
    /// by the init instructions (q17 / d15). Only aarch64 is supported.
    /// </summary>
    public class MixAddvJsCvtNsq : UarchTest
    {
        /// <summary>
        /// Builds the list of filler counts from <paramref name="low"/>,
        /// <paramref name="high"/> and <paramref name="step"/> and fills in the
        /// metadata used for the generated C prototypes and calls.
        /// </summary>
        public MixAddvJsCvtNsq(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixaddvjscvtnsq";
            this.Description = "ADDV and fjcvtzs Scheduler, Excluding any NSQ";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Returns true only for aarch64.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.aarch64;
        }

        /// <summary>
        /// Appends the aarch64 test functions to <paramref name="sb"/>; emits
        /// nothing for any other ISA.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64) return;

            // q17 / d15 are loaded once up front; q16 / d2 are reloaded using w25 as index.
            string initInstrs = " ldr q17, [x2]\n ldr d15, [x2]";
            string postLoadInstrs1 = " ldr q16, [x2, w25, sxtw #0]\n ldr d2, [x2, w25, sxtw #0]";

            string[] depInstrs =
            {
                " addv h1, v16.4h",
                " fjcvtzs w15, d2",
                " addv h3, v16.4h",
                " fjcvtzs w14, d2",
            };
            string[] indepInstrs =
            {
                " addv h4, v17.4h",
                " fjcvtzs w12, d15",
                " addv h5, v17.4h",
                " fjcvtzs w13, d15",
            };

            // The last entry of Counts is passed as the second argument of the helper.
            int lastCount = this.Counts[this.Counts.Length - 1];
            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(sb, lastCount, this.Counts, this.Prefix, depInstrs, indepInstrs, false, initInstrs: initInstrs, postLoadInstrs: postLoadInstrs1);
        }
    }
}
================================================ FILE: AsmGen/tests/MixAddvJsCvtSched.cs ================================================
using System.Text; namespace AsmGen { public class MixAddvJsCvtSched : UarchTest { public MixAddvJsCvtSched(int low, int high, int step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "mixaddvjscvtsched"; this.Description = "ADDV and fjcvtzs Scheduler"; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; this.GetFunctionCallParameters = "structIterations, A, fpArr"; this.DivideTimeByCount = false; } public override bool
SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.aarch64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.aarch64) { string postLoadInstrs1 = " ldr q16, [x2, w25, sxtw #0]\n ldr d2, [x2, w25, sxtw #0]"; string postLoadInstrs2 = " ldr q16, [x2, w26, sxtw #0]\n ldr d2, [x2, w26, sxtw #0]"; string[] unrolledInstrs = new string[4]; unrolledInstrs[0] = " addv h1, v16.4h"; unrolledInstrs[1] = " fjcvtzs w15, d2"; unrolledInstrs[2] = " addv h3, v16.4h"; unrolledInstrs[3] = " fjcvtzs w14, d2"; UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, null, postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2); } } } }
================================================ FILE: AsmGen/tests/MixBranchStoreTest.cs ================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Emits aarch64 functions in which each of two pointer chasing loads is
    /// followed by a run of compare + conditional-branch pairs, optionally
    /// preceded by a dependent branch. Only aarch64 is supported.
    /// </summary>
    public class MixBranchStoreTest : UarchTest
    {
        // Stored by the constructor; not read anywhere in this class.
        private bool mixNops;
        private bool initialDependentBranch;

        /// <summary>
        /// Builds the list of filler counts and the test metadata. When
        /// <paramref name="initialDependentBranch"/> is set, "db" is appended to
        /// the prefix and the description notes the dependent branch.
        /// </summary>
        public MixBranchStoreTest(int low, int high, int step, bool mixNops = false, bool initialDependentBranch = false)
        {
            string prefixSuffix = initialDependentBranch ? "db" : string.Empty;
            string descriptionSuffix = initialDependentBranch ? ", preceded by dependent branch" : string.Empty;

            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixstqbob" + prefixSuffix;
            this.Description = "Mixed NT branches and stores" + descriptionSuffix;
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.mixNops = mixNops;
            this.initialDependentBranch = initialDependentBranch;
        }

        /// <summary>Returns true only for aarch64, with or without the dependent branch.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.aarch64;
        }

        /// <summary>
        /// Appends the aarch64 test functions to <paramref name="sb"/>, followed
        /// by the dependent branch target when that variant is enabled; emits
        /// nothing for any other ISA.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64) return;

            GenerateArmAsm(sb);
            if (this.initialDependentBranch)
                sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
        }

        /// <summary>
        /// Appends one function per entry in Counts. Each function saves the
        /// registers it uses, loops over two pointer chasing loads (w25 and w26)
        /// each followed by that many filler branches, then restores the
        /// registers and returns.
        /// </summary>
        public void GenerateArmAsm(StringBuilder sb)
        {
            string dependentBranch = null;
            if (this.initialDependentBranch)
                dependentBranch = UarchTestHelpers.GetArmDependentBranch(this.Prefix);

            foreach (int fillerCount in Counts)
            {
                string funcName = Prefix + fillerCount;

                // Prologue: spill the registers used below and seed them with constants.
                sb.AppendLine("\n" + funcName + ":");
                sb.AppendLine(" sub sp, sp, #0x50");
                sb.AppendLine(" stp x14, x15, [sp, #0x10]");
                sb.AppendLine(" stp x12, x13, [sp, #0x20]");
                sb.AppendLine(" stp x10, x11, [sp, #0x30]");
                sb.AppendLine(" stp x25, x26, [sp, #0x40]");
                sb.AppendLine(" mov x15, 1");
                sb.AppendLine(" mov x14, 2");
                sb.AppendLine(" mov x13, 3");
                sb.AppendLine(" mov x12, 4");
                sb.AppendLine(" mov x11, 5");
                sb.AppendLine(" mov x10, 6");
                sb.AppendLine(" mov w25, 0x0");
                sb.AppendLine(" mov w26, 0x40");

                sb.AppendLine("\n" + funcName + "start:");

                // First pointer chasing load: current = A[current]
                sb.AppendLine(" ldr w25, [x1, w25, uxtw #2]");
                if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
                AppendFillerBranches(sb, funcName, "w25", fillerCount);

                // Second pointer chasing load, same shape as the first.
                sb.AppendLine(" ldr w26, [x1, w26, uxtw #2]");
                if (this.initialDependentBranch) sb.AppendLine(dependentBranch);
                AppendFillerBranches(sb, funcName, "w26", fillerCount);

                // Loop control and epilogue.
                sb.AppendLine(" sub x0, x0, 1");
                sb.AppendLine(" cbnz x0, " + funcName + "start");
                sb.AppendLine(" ldp x25, x26, [sp, #0x40]");
                sb.AppendLine(" ldp x10, x11, [sp, #0x30]");
                sb.AppendLine(" ldp x12, x13, [sp, #0x20]");
                sb.AppendLine(" ldp x14, x15, [sp, #0x10]");
                sb.AppendLine(" add sp, sp, #0x50");
                sb.AppendLine(" ret\n\n");
            }
        }

        // Appends fillerCount compare/branch pairs, each branching to a label placed
        // directly after it. x15 and x10 are initialised to 1 and 6 in the prologue,
        // so the b.eq condition does not hold.
        private static void AppendFillerBranches(StringBuilder sb, string funcName, string chaseReg, int fillerCount)
        {
            for (int n = 0; n < fillerCount; n++)
            {
                string target = $"{funcName}_{chaseReg}_target{n}";
                sb.AppendLine(" cmp x15, x10");
                sb.AppendLine($" b.eq {target}");
                sb.AppendLine($"{target}:");
            }
        }
    }
}
================================================ FILE: AsmGen/tests/MixFAdd256and32RfTest.cs ================================================
using System.Text; namespace AsmGen { public class MixFAdd256and32RfTest : UarchTest { public MixFAdd256and32RfTest(int low, int high, int step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "fadd256and32rf"; this.Description = "Mixed 32-bit scalar and 256-bit FP RF capacity"; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; this.GetFunctionCallParameters = "structIterations, A, fpArr"; this.DivideTimeByCount = false; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) return true; if (isa == IUarchTest.ISA.aarch64) return false; if (isa == IUarchTest.ISA.mips64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { string initInstrs = " vmovups (%r8), %ymm0\n" + " movss (%r8), %xmm1\n" + " vmovups %ymm0, %ymm2\n" + " movss (%r8), %xmm3\n" + " vmovups %ymm0, %ymm4\n" + " movss (%r8), %xmm5\n"; string[] unrolledAdds = new string[4]; unrolledAdds[0] = " vaddps %ymm0, %ymm1, %ymm1"; unrolledAdds[1] = " addss %xmm5, %xmm2"; unrolledAdds[2] = " vaddps %ymm0, %ymm3, %ymm3"; unrolledAdds[3] = " addss %xmm5, %xmm4";
UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs); } else if (isa == IUarchTest.ISA.aarch64) { } else if (isa == IUarchTest.ISA.mips64) { string initInstrs = ""; for (int regIdx = 0; regIdx < 32; regIdx++) { initInstrs += " xvld $xr" + regIdx + ", $r6, " + regIdx * 32 + "\n"; initInstrs += " fld.s $f" + regIdx + ", $r6, " + regIdx * 4 + "\n"; } string[] unrolledAdds = new string[4]; unrolledAdds[0] = " xvfadd.s $xr1, $xr1, $xr1"; unrolledAdds[1] = " fadd.s $f11, $f11, $f11"; unrolledAdds[2] = " xvfadd.s $xr3, $xr3, $xr3"; unrolledAdds[3] = " fadd.s $f12, $f12, $f12"; UarchTestHelpers.GenerateMipsAsmStructureTestFuncs( sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs: initInstrs); } } } }
================================================ FILE: AsmGen/tests/MixFpRfDepBranchTest.cs ================================================
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// FP register file test that inserts a compare + conditional branch after
    /// every <c>interval</c>-th scalar FP add. Only aarch64 is supported.
    /// </summary>
    public class MixFpRfDepBranchTest : UarchTest
    {
        // Spacing of the inserted branches; used as a modulus in GenerateAsm.
        private int interval;

        /// <summary>
        /// Builds the list of filler counts and the test metadata.
        /// </summary>
        /// <param name="low">Passed to <c>GenerateCountArray</c>.</param>
        /// <param name="high">Passed to <c>GenerateCountArray</c>.</param>
        /// <param name="step">Passed to <c>GenerateCountArray</c>.</param>
        /// <param name="interval">
        /// A branch is inserted after each add whose zero-based index is a
        /// multiple of this value. It is appended to the prefix and to the
        /// branch target label. Must be non-zero, because it is used as a
        /// divisor when the assembly is generated.
        /// </param>
        public MixFpRfDepBranchTest(int low, int high, int step, int interval)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixfprfdepbranch" + interval;
            this.Description = "FP Register File, with some dependent branches";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *fpArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.interval = interval;
        }

        /// <summary>Returns true only for aarch64.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        /// <summary>
        /// Appends the aarch64 test functions and the shared branch target label
        /// to <paramref name="sb"/>; emits nothing for any other ISA.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64)
            {
                string initInstrs = " ldr s17, [x2]\n" +
                    " ldr s18, [x2, 4]\n" +
                    " ldr s19, [x2, 8]\n" +
                    " ldr s20, [x2, 12]\n" +
                    " ldr s21, [x2, 16]\n";

                // Typed list of instruction strings: List<T> needs its element type
                // for the declaration to compile and for ToArray() to yield string[].
                List<string> unrolledAddsList = new List<string>();

                // One add per filler slot, up to the last entry of Counts.
                for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++)
                {
                    // Rotate the destination through s18..s21; s17 is the common addend.
                    int regnum = 18 + (i % 4);
                    unrolledAddsList.Add($" fadd s{regnum}, s{regnum}, s17");

                    // i starts at 0, so a branch always follows the very first add.
                    if (i % interval == 0) unrolledAddsList.Add(" cmp x25, x26\n b.eq mixfpjumpsched_badthing" + interval);
                }

                string[] unrolledAdds = unrolledAddsList.ToArray();
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true, initInstrs: initInstrs);

                // Target of the inserted branches: a label followed by a raw data word.
                sb.AppendLine($"mixfpjumpsched_badthing{interval}:\n .word 0xf7f0a000");
            }
        }
    }
}
================================================ FILE: AsmGen/tests/MixFpVecRfTest.cs ================================================
using System.Collections.Generic; using System.Text; namespace AsmGen { public class MixFpVecRfTest : UarchTest { private bool initialDependentBranch; public MixFpVecRfTest(int low, int high, int step, bool initialDependentBranch) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "mixfpvecrf" + (initialDependentBranch ? "db" : string.Empty); this.Description = "Mixed FP/128-bit FP vec rf" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty); this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; this.GetFunctionCallParameters = "structIterations, A, fpArr"; this.DivideTimeByCount = false; this.initialDependentBranch = initialDependentBranch; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (this.initialDependentBranch) { if (isa == IUarchTest.ISA.riscv) return true; return false; } if (isa == IUarchTest.ISA.riscv) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.riscv) { string initInstrs = " vsetvli t5, t6, e32\n vlw.v v0, (a1)\n fld f0, (a1)"; string postLoadInstrs = this.initialDependentBranch ?
UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty; postLoadInstrs += "\n mv t6, a2"; string[] unrolledInstrs = new string[2]; unrolledInstrs[0] = " vfadd.vv v0, v0, v0"; unrolledInstrs[1] = " fadd.s f0, f0, f0"; UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, false, initInstrs: initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs); if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix)); } } } }
================================================ FILE: AsmGen/tests/MixIntRfDepBranchTest.cs ================================================
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Integer register file test that inserts a compare + conditional branch
    /// after every <c>interval</c>-th integer add. Only aarch64 is supported.
    /// </summary>
    public class MixIntRfDepBranchTest : UarchTest
    {
        // Spacing of the inserted branches; used as a modulus in GenerateAsm.
        private int interval;

        /// <summary>
        /// Builds the list of filler counts and the test metadata.
        /// </summary>
        /// <param name="low">Passed to <c>GenerateCountArray</c>.</param>
        /// <param name="high">Passed to <c>GenerateCountArray</c>.</param>
        /// <param name="step">Passed to <c>GenerateCountArray</c>.</param>
        /// <param name="interval">
        /// A branch is inserted after each add whose one-based index is a
        /// multiple of this value. It is appended to the prefix and to the
        /// branch target label. Must be non-zero, because it is used as a
        /// divisor when the assembly is generated.
        /// </param>
        public MixIntRfDepBranchTest(int low, int high, int step, int interval)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixintrfdepbranch" + interval;
            this.Description = "Integer Register File, with some dependent branches";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
            this.interval = interval;
        }

        /// <summary>Returns true only for aarch64.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            return false;
        }

        /// <summary>
        /// Appends the aarch64 test functions and the shared branch target label
        /// to <paramref name="sb"/>; emits nothing for any other ISA.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64)
            {
                // Typed list of instruction strings: List<T> needs its element type
                // for the declaration to compile and for ToArray() to yield string[].
                List<string> unrolledAddsList = new List<string>();

                // The index runs from 1 to the last entry of Counts inclusive, so the
                // first branch appears after the interval-th add rather than the first.
                for (int i = 1; i < this.Counts[this.Counts.Length - 1] + 1; i++)
                {
                    // Rotate the destination through x12..x15; x11 is the common addend.
                    int regnum = 12 + (i % 4);
                    unrolledAddsList.Add($" add x{regnum}, x{regnum}, x11");
                    if (i % interval == 0) unrolledAddsList.Add(" cmp x25, x26\n b.eq mixintjumpsched_badthing" + interval);
                }

                string[] unrolledAdds = unrolledAddsList.ToArray();
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: true);

                // Target of the inserted branches: a label followed by a raw data word.
                sb.AppendLine($"mixintjumpsched_badthing{interval}:\n .word 0xf7f0a000");
            }
        }
    }
}
================================================ FILE: AsmGen/tests/MixIntVec128RfTest.cs ================================================
using System.Text; namespace AsmGen { public class MixIntVec128RfTest : UarchTest { private bool initialDependentBranch; public MixIntVec128RfTest(int low, int high, int step, bool initialDependentBranch) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "mixintvec128" + (initialDependentBranch ? "db" : string.Empty); this.Description = "Mixed integer and 128-bit vector register file capacity" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty); this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; this.GetFunctionCallParameters = "structIterations, A, fpArr"; this.DivideTimeByCount = false; this.initialDependentBranch = initialDependentBranch; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false; if (isa == IUarchTest.ISA.amd64) return true; if (isa == IUarchTest.ISA.aarch64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { string initInstrs = " vmovups (%r8), %ymm0\n" + " movss (%r8), %xmm1\n" + " vmovups %ymm0, %ymm2\n" + " movss (%r8), %xmm3\n" + " vmovups %ymm0, %ymm4\n" + " movss (%r8), %xmm5\n"; string[] unrolledAdds = new string[4]; unrolledAdds[0] = " add %r11, %r15"; unrolledAdds[1] = " addss %xmm5, %xmm2"; unrolledAdds[2] = " add %r11, %r14"; unrolledAdds[3] = " addss %xmm5, %xmm4"; UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: initInstrs); } else if (isa == IUarchTest.ISA.aarch64) { string postLoadInstrs = this.initialDependentBranch ?
UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null; string initInstrs = " ldr q0, [x1]\n" + " ldr q1, [x1, #0x10]\n" + " ldr q2, [x1, #0x20]\n" + " ldr q3, [x1, #0x30]\n" + " ldr q4, [x1, #0x40]\n"; string[] unrolledAdds = new string[4]; unrolledAdds[0] = " add v1.4s, v1.4s, v0.4s"; unrolledAdds[1] = " add x15, x15, x11"; unrolledAdds[2] = " add v2.4s, v2.4s, v0.4s"; unrolledAdds[3] = " add x14, x14, x11"; UarchTestHelpers.GenerateArmAsmStructureTestFuncs( sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs); if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix)); } } } }
================================================ FILE: AsmGen/tests/MixIntrfFprfTest.cs ================================================
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Mixed integer / FP register file test, optionally preceded by a
    /// dependent branch. Only the riscv path is currently enabled in
    /// <see cref="SupportsIsa"/>; the amd64, aarch64 and mips64 paths are
    /// marked todo and emit FP adds only.
    /// </summary>
    public class MixIntFpRfTest : UarchTest
    {
        private bool initialDependentBranch;

        /// <summary>
        /// Builds the list of filler counts and the test metadata. When
        /// <paramref name="initialDependentBranch"/> is set, "db" is appended to
        /// the prefix and the description notes the dependent branch.
        /// </summary>
        public MixIntFpRfTest(int low, int high, int step, bool initialDependentBranch)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixintfprf" + (initialDependentBranch ? "db" : string.Empty);
            this.Description = "Mixed INT/FP Register File" + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.initialDependentBranch = initialDependentBranch;
        }

        /// <summary>Returns true only for riscv; the other ISAs are disabled below.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            //if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false;
            //if (isa == IUarchTest.ISA.amd64) return true;
            //if (isa == IUarchTest.ISA.aarch64) return true;
            //if (isa == IUarchTest.ISA.mips64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        /// <summary>
        /// Appends the test functions for <paramref name="isa"/> to
        /// <paramref name="sb"/>; emits nothing for an ISA without a branch here.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // todo
                string initInstrs = " movss (%r8), %xmm1\n" +
                    " movss 4(%r8), %xmm2\n" +
                    " movss 8(%r8), %xmm3\n" +
                    " movss 12(%r8), %xmm4\n" +
                    " movss 16(%r8), %xmm5\n";

                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " addss %xmm1, %xmm2";
                unrolledAdds[1] = " addss %xmm1, %xmm3";
                unrolledAdds[2] = " addss %xmm1, %xmm4";
                unrolledAdds[3] = " addss %xmm1, %xmm5";
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // todo
                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;
                string initInstrs = " ldr s17, [x2]\n" +
                    " ldr s18, [x2, 4]\n" +
                    " ldr s19, [x2, 8]\n" +
                    " ldr s20, [x2, 12]\n" +
                    " ldr s21, [x2, 16]\n";

                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " fadd s18, s18, s17";
                unrolledAdds[1] = " fadd s19, s19, s17";
                unrolledAdds[2] = " fadd s20, s20, s17";
                unrolledAdds[3] = " fadd s21, s21, s17";
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                // todo
                string initInstrs = " fld.s $f8, $r6, 0\n" +
                    " fld.s $f9, $r6, 4\n" +
                    " fld.s $f10, $r6, 8\n" +
                    " fld.s $f11, $r6, 12\n" +
                    " fld.s $f12, $r6, 16\n";

                string[] unrolledAdds = new string[4];
                unrolledAdds[0] = " fadd.s $f9, $f9, $f8";
                unrolledAdds[1] = " fadd.s $f10, $f10, $f8";
                unrolledAdds[2] = " fadd.s $f11, $f11, $f8";
                unrolledAdds[3] = " fadd.s $f12, $f12, $f8";
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, includePtrChasingLoads: false, initInstrs);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                // Unlike the aarch64 branch above, the dependent branch target is
                // emitted before the test functions here.
                if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
                string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : string.Empty;
                string initInstrs = " fld f0, (x12)\n" +
                    " fld f1, 8(x12)\n" +
                    " fld f2, 16(x12)\n" +
                    " fld f3, 24(x12)\n" +
                    " fld f4, 32(x12)\n";

                // Typed list of instruction strings: List<T> needs its element type
                // for the declaration to compile and for ToArray() to yield string[].
                List<string> unrolledAdds = new List<string>();

                /* for C910 */
                // 30 FP adds rotating through f0..f3, then 200 identical integer adds.
                for (int i = 0; i < 30; i++) unrolledAdds.Add($" fadd.s f{i % 4}, f{i % 4}, f4");
                for (int i = 0; i < 200; i++) unrolledAdds.Add(" add x28, x28, x29");

                /*unrolledAdds.Add(" fadd.s f0, f0, f4");
                unrolledAdds.Add(" add x28, x28, x29");
                unrolledAdds.Add(" fadd.s f1, f1, f4");
                unrolledAdds.Add(" add x30, x30, x29");
                unrolledAdds.Add(" fadd.s f2, f2, f4");
                unrolledAdds.Add(" add x31, x31, x29");
                unrolledAdds.Add(" fadd.s f3, f3, f4");
                unrolledAdds.Add(" add x18, x18, x29");*/

                string[] unrolledAddsArr = unrolledAdds.ToArray();
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAddsArr, unrolledAddsArr, includePtrChasingLoads: false, initInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);
            }
        }
    }
}
================================================ FILE: AsmGen/tests/MixJumpStoreDataSched.cs ================================================
using System.Text; namespace AsmGen { public class MixJumpStoreDataSched : UarchTest { public MixJumpStoreDataSched(int low, int high, int step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "mixjumpstoredatasched"; this.Description = "Scheduler, Mixed Jumps and Store Data"; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatarr"; this.GetFunctionCallParameters = "structIterations, A, fpArr"; this.DivideTimeByCount = false; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) return true; //if (isa == IUarchTest.ISA.aarch64) return true; // if (isa == IUarchTest.ISA.mips64) return true; // if (isa == IUarchTest.ISA.riscv) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa ==
IUarchTest.ISA.amd64) { string[] unrolledJumps = new string[4]; unrolledJumps[0] = " cmp %rdi, %rsi\n je mixjumpstoredatasched_reallybadthing"; unrolledJumps[1] = " mov %rdi, (%r8)"; unrolledJumps[2] = " cmp %rdi, %rsi\n je mixjumpstoredatasched_reallybadthing"; unrolledJumps[3] = " mov %rdi, 64(%r8)"; UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true); sb.AppendLine("mixjumpstoredatasched_reallybadthing:\n int3"); } } } }
================================================ FILE: AsmGen/tests/MixJumpStoreSchedTest.cs ================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Scheduler test that alternates compare + conditional jump pairs with
    /// stores whose address uses %rdi as an index. Only amd64 is enabled.
    /// </summary>
    public class MixJumpStoreSchedTest : UarchTest
    {
        /// <summary>
        /// Builds the list of filler counts from <paramref name="low"/>,
        /// <paramref name="high"/> and <paramref name="step"/> and fills in the
        /// metadata used for the generated C prototypes and calls.
        /// </summary>
        public MixJumpStoreSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixjumpstoresched";
            this.Description = "Scheduler, Mixed Jumps and Stores (Address Dependency)";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatarr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Returns true only for amd64; the other ISAs are disabled below.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            //if (isa == IUarchTest.ISA.aarch64) return true;
            // if (isa == IUarchTest.ISA.mips64) return true;
            // if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        /// <summary>
        /// Appends the amd64 test functions and the shared jump target label to
        /// <paramref name="sb"/>; emits nothing for any other ISA.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] unrolledJumps = new string[4];
                unrolledJumps[0] = " cmp %rdi, %rsi\n je mixstorejumpsched_reallybadthing";
                unrolledJumps[1] = " mov %r14, (%r8, %rdi, 2)";
                unrolledJumps[2] = " cmp %rdi, %rsi\n je mixstorejumpsched_reallybadthing";
                unrolledJumps[3] = " mov %r14, 64(%r8, %rdi, 2)";
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledJumps, unrolledJumps, includePtrChasingLoads: true);

                // Jump target: a label followed by an int3 breakpoint instruction.
                sb.AppendLine("mixstorejumpsched_reallybadthing:\n int3");
            }
        }
    }
}
================================================ FILE: AsmGen/tests/MixJumpThenAddSched.cs ================================================
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Scheduler test whose filler sequence is 40 compare + conditional branch
    /// pairs followed by integer adds. Only aarch64 is enabled.
    /// </summary>
    public class MixJumpThenAddSched : UarchTest
    {
        /// <summary>
        /// Builds the list of filler counts from <paramref name="low"/>,
        /// <paramref name="high"/> and <paramref name="step"/> and fills in the
        /// metadata used for the generated C prototypes and calls.
        /// </summary>
        public MixJumpThenAddSched(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixjumpthenaddsched";
            this.Description = "Scheduler, 40 NT jumps + adds";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        /// <summary>Returns true only for aarch64; the other ISAs are disabled below.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            // if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            // if (isa == IUarchTest.ISA.mips64) return true;
            // if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        /// <summary>
        /// Appends the aarch64 test functions and the shared branch target label
        /// to <paramref name="sb"/>; emits nothing for any other ISA.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64)
            {
                // Typed list of instruction strings: List<T> needs its element type
                // for the declaration to compile and for ToArray() to yield string[].
                List<string> unrolledJumps = new List<string>();

                // The first 40 entries are always branches, even when the last
                // entry of Counts is below 40.
                int instrIdx;
                for (instrIdx = 0; instrIdx < 40; instrIdx++)
                    unrolledJumps.Add(" cmp x25, x26\n b.eq mixaddthenjumpsched_reallybadthing");

                // Any remaining slots, up to the last entry of Counts, are adds.
                for (; instrIdx < this.Counts[this.Counts.Length - 1]; instrIdx++)
                    unrolledJumps.Add(" add x15, x15, x25");

                string[] instrs = unrolledJumps.ToArray();
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, instrs, instrs, includePtrChasingLoads: true, dsb: true);

                // Branch target: a label followed by a raw data word.
                sb.AppendLine("mixaddthenjumpsched_reallybadthing:\n .word 0xf7f0a000");
            }
        }
    }
}
================================================ FILE: AsmGen/tests/MixLdqStqTest.cs ================================================
using System.Text; namespace AsmGen { public class MixLdqStqTest : UarchTest { private bool initialDependentBranch; public MixLdqStqTest(int low, int high, int step, bool initialDependentBranch) { this.Counts =
UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixldqstq" + (initialDependentBranch ? "db" : string.Empty);
            this.Description = "Mixed Load/Store Queue Test (mem ops pending retire)" +
                (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, int *arr1";
            this.GetFunctionCallParameters = "structIterations, A, B";
            this.initialDependentBranch = initialDependentBranch;
        }

        /// <summary>
        /// aarch64 is always supported. amd64 is supported only for the plain variant:
        /// the x86 generator below never emits the dependent branch, so reporting support
        /// for the "db" variant would produce a test that is labeled as branch-preceded
        /// but is not.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.amd64) return !this.initialDependentBranch;
            return false;
        }

        /// <summary>
        /// Dispatches to the generator for the requested ISA; other ISAs emit nothing.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                GenerateX86GccAsm(sb);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                GenerateArmAsm(sb);
            }
        }

        /// <summary>
        /// Emits the amd64 functions: alternating stores to (%r8) and loads from (%rdx).
        /// </summary>
        public void GenerateX86GccAsm(StringBuilder sb)
        {
            string[] instrs =
            {
                " mov %r15, (%r8)",
                " mov (%rdx), %r14",
                " mov %r13, (%r8)",
                " mov (%rdx), %r12",
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, instrs, instrs, true);
        }

        /// <summary>
        /// Emits the aarch64 functions: alternating stores to [x2] and loads from [x1],
        /// optionally preceded by a dependent branch whose target is appended afterwards.
        /// </summary>
        public void GenerateArmAsm(StringBuilder sb)
        {
            string postLoadInstrs = this.initialDependentBranch ?
                UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null;

            string[] instrs =
            {
                " str x15, [x2]",
                " ldr x14, [x1]",
                " str x13, [x2]",
                " ldr x12, [x1]",
            };
            UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, instrs, instrs, true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs);

            if (this.initialDependentBranch)
            {
                sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
            }
        }
    }
}

================================================
FILE: AsmGen/tests/MixLoadStoreDivSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// "Load/Store Scheduler Capacity Test, using divs to block retirement":
    /// one load and one store per pair, built on the div-based structure test helpers.
    /// </summary>
    public class MixLoadStoreDivSchedTest : UarchTest
    {
        /// <summary>
        /// Sets up the test metadata.
        /// </summary>
        /// <param name="low">first argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="high">second argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="step">third argument forwarded to <c>GenerateCountArray</c></param>
        public MixLoadStoreDivSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixloadstoredivsched";
            this.Description = "Load/Store Scheduler Capacity Test, using divs to block retirement";
            this.FunctionDefinitionParameters = "uint64_t iterations, int count, int *arr2, int *arr3";
            this.GetFunctionCallParameters = "structIterations, list_size, B, A";
            this.DivideTimeByCount = false;
        }

        /// <summary>
        /// Supported on amd64 and aarch64.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        /// <summary>
        /// Dispatches to the generator for the requested ISA; other ISAs emit nothing.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                GenerateX86Asm(sb);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                GenerateArmAsm(sb);
            }
        }

        /// <summary>
        /// Emits the amd64 functions; both instruction sets are the same %rdx-indexed pair.
        /// </summary>
        public void GenerateX86Asm(StringBuilder sb)
        {
            string[] loadThenStore =
            {
                " mov (%r9, %rdx, 4), %r15",
                " mov %r14, (%r8, %rdx, 4)",
            };
            UarchTestHelpers.GenerateX86AsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, loadThenStore, loadThenStore, false);
        }

        /// <summary>
        /// Emits the aarch64 functions; the first set indexes with w25, the second with w26.
        /// </summary>
        public void GenerateArmAsm(StringBuilder sb)
        {
            string[] indexedByW25 =
            {
                " ldr w15, [x3, w25, uxtw #2]",
                " str w14, [x2, w25, uxtw #2]",
            };
            string[] indexedByW26 =
            {
                " ldr w15, [x3, w26, uxtw #2]",
                " str w14, [x2, w26, uxtw #2]",
            };
            UarchTestHelpers.GenerateArmAsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, indexedByW25, indexedByW26, false);
        }
    }
}

================================================
FILE: AsmGen/tests/MixLoadStoreSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// "Mixed Load/Store Address Scheduler": alternating stores and loads.
    /// Generators exist for amd64, aarch64 and riscv.
    /// </summary>
    public class MixLoadStoreSched : UarchTest
    {
        /// <summary>
        /// Sets up the test metadata.
        /// </summary>
        /// <param name="low">first argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="high">second argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="step">third argument forwarded to <c>GenerateCountArray</c></param>
        public MixLoadStoreSched(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixloadstoresched";
            this.Description = "Mixed Load/Store Address Scheduler";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>
        /// Supported on amd64, aarch64 and riscv.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64
                || isa == IUarchTest.ISA.aarch64
                || isa == IUarchTest.ISA.riscv;
        }

        /// <summary>
        /// Appends the test functions for the requested ISA; other ISAs emit nothing.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // first set is indexed by %rdi, second by %rsi
                // (loads use scale 2 in the first set and scale 4 in the second)
                string[] indexedByRdi =
                {
                    " mov %r15, (%r8, %rdi, 4)",
                    " mov (%r8, %rdi, 2), %r14",
                    " mov %r13, (%r8, %rdi, 4)",
                    " mov (%r8, %rdi, 2), %r12",
                };
                string[] indexedByRsi =
                {
                    " mov %r15, (%r8, %rsi, 4)",
                    " mov (%r8, %rsi, 4), %r14",
                    " mov %r13, (%r8, %rsi, 4)",
                    " mov (%r8, %rsi, 4), %r12",
                };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, indexedByRdi, indexedByRsi, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // first set is indexed by w25, second by w26
                string[] indexedByW25 =
                {
                    " str w15, [x2, w25, uxtw #2]",
                    " ldr w14, [x1, w25, uxtw #0]",
                    " str w13, [x2, w25, uxtw #2]",
                    " ldr w12, [x1, w25, uxtw #0]",
                };
                string[] indexedByW26 =
                {
                    " str w15, [x2, w26, uxtw #2]",
                    " ldr w14, [x1, w26, uxtw #0]",
                    " str w13, [x2, w26, uxtw #2]",
                    " ldr w12, [x1, w26, uxtw #0]",
                };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, indexedByW25, indexedByW26, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                // x5 and x6 are pointer chasing loads
                string postLoadInstrs1 = " andi x7, x5, 0xF\n add x7, x7, x12";
                string postLoadInstrs2 = " andi x7, x6, 0xF\n add x7, x7, x12";

                // NOTE(review): x7 is derived from the pointer chasing result above, but the
                // accesses below use a2 as their base, so they do not read x7. Confirm whether
                // (x7) was the intended base register.
                string[] storesAndLoads =
                {
                    " sd x28, (a2)",
                    " ld x29, 8(a2)",
                    " sd x30, 16(a2)",
                    " ld x31, 24(a2)",
                };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, storesAndLoads, storesAndLoads, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2);
            }
        }
    }
}

================================================
FILE: AsmGen/tests/MixStoreDivSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    public class MixStoreDivSchedTest : UarchTest
    {
        public MixStoreDivSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixstoresched";
            this.Description = "Store (Mixed Data/Address) Scheduler Capacity Test";
            this.FunctionDefinitionParameters = "uint64_t iterations, int count, int *arr2";
            this.GetFunctionCallParameters = "structIterations, list_size, B";
            this.DivideTimeByCount = false;
        }

        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return
true;
            return false;
        }

        /// <summary>
        /// Dispatches to the generator for the requested ISA; other ISAs emit nothing.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                GenerateX86Asm(sb);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                GenerateArmAsm(sb);
            }
        }

        /// <summary>
        /// Emits the amd64 functions. Each set alternates a store of %rdx addressed by the
        /// other register with a store of the other register addressed by %rdx; the first
        /// set pairs %rdx with %r15, the second with %r11.
        /// </summary>
        public void GenerateX86Asm(StringBuilder sb)
        {
            string[] pairedWithR15 =
            {
                " mov %rdx, (%r8, %r15, 4)",
                " mov %r15, (%r8, %rdx, 4)",
                " mov %rdx, (%r8, %r15, 4)",
                " mov %r15, (%r8, %rdx, 4)",
            };
            string[] pairedWithR11 =
            {
                " mov %rdx, (%r8, %r11, 4)",
                " mov %r11, (%r8, %rdx, 4)",
                " mov %rdx, (%r8, %r11, 4)",
                " mov %r11, (%r8, %rdx, 4)",
            };
            UarchTestHelpers.GenerateX86AsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, pairedWithR15, pairedWithR11, false);
        }

        /// <summary>
        /// Emits the aarch64 functions. Same alternation as the amd64 version; the first set
        /// pairs w15 with w25, the second pairs w15 with w26.
        /// </summary>
        public void GenerateArmAsm(StringBuilder sb)
        {
            string[] pairedWithW25 =
            {
                " str w25, [x2, w15, uxtw #2]",
                " str w15, [x2, w25, uxtw #2]",
                " str w25, [x2, w15, uxtw #2]",
                " str w15, [x2, w25, uxtw #2]",
            };
            string[] pairedWithW26 =
            {
                " str w26, [x2, w15, uxtw #2]",
                " str w15, [x2, w26, uxtw #2]",
                " str w26, [x2, w15, uxtw #2]",
                " str w15, [x2, w26, uxtw #2]",
            };
            UarchTestHelpers.GenerateArmAsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, pairedWithW25, pairedWithW26, false);
        }
    }
}

================================================
FILE: AsmGen/tests/MixVec512Vec256BlockRfTest.cs
================================================
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Mixed zmm/ymm register test (AVX-512 only): a block of <c>nTiny</c> 256-bit
    /// instructions followed by 512-bit instructions up to the largest configured count.
    /// </summary>
    public class MixVec512Vec256BlockRfTest : UarchTest
    {
        // number of tiny registers
        private int nTiny;

        /// <summary>
        /// Sets up the test metadata.
        /// </summary>
        /// <param name="low">first argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="high">second argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="step">third argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="nTiny">number of leading 256-bit (ymm) instructions</param>
        public MixVec512Vec256BlockRfTest(int low, int high, int step, int nTiny)
        {
            // Store the parameter in the field. Without this the field keeps its default
            // of 0 and GenerateAsm never emits any ymm instruction, regardless of the
            // count advertised in the prefix and description.
            this.nTiny = nTiny;

            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixvec512vec256blockrf" + nTiny;
            this.Description = $"Mixed zmm/ymm regs - AVX-512 only, {nTiny} 256-bit then 512-bit";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>
        /// Only amd64 is supported.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64;
        }

        /// <summary>
        /// Appends the amd64 test functions. Does nothing for any other ISA.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            // use even numbered regs for ymm testing
            string initInstrs = " vmovups (%r8), %zmm1\n" +
                " vmovups 64(%r8), %ymm2\n" +
                " vmovups 128(%r8), %zmm3\n" +
                " vmovups 192(%r8), %ymm4\n" +
                " vmovups 256(%r8), %zmm5\n";

            // use all zmm regs
            for (int reg = 6; reg < 32; reg++)
            {
                bool isEven = (reg & 1) == 0;
                initInstrs += isEven
                    ? "vmovups %ymm2, %ymm" + reg + "\n"
                    : "vmovups %zmm5, %zmm" + reg + "\n";
            }

            // List<string>: the element type is required for this to compile
            List<string> instrsList = new List<string>();

            // leading block of 256-bit ops on even numbered registers
            for (int i = 0; i < this.nTiny; i++)
            {
                int regNum = ((i & 1) == 0) ? i & 0x1F : (i + 1) & 0x1F;
                instrsList.Add($" vxorps %ymm2, %ymm{regNum}, %ymm{regNum}");
            }

            // remaining ops are 512-bit on odd numbered registers
            int maxCount = this.Counts[this.Counts.Length - 1];
            for (int i = this.nTiny; i < maxCount; i++)
            {
                int regNum = ((i & 1) == 0) ? i : (i + 1);
                regNum = (regNum + 1) & 0x1F;
                instrsList.Add($" vxorps %zmm1, %zmm{regNum}, %zmm{regNum}");
            }

            string[] unrolledAdds = instrsList.ToArray();
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs);
        }
    }
}

================================================
FILE: AsmGen/tests/MixVec512Vec256RfTest.cs
================================================
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Mixed zmm/ymm register test (AVX-512 only) that alternates 512-bit and 256-bit adds.
    /// </summary>
    public class MixVec512Vec256RfTest : UarchTest
    {
        /// <summary>
        /// Sets up the test metadata.
        /// </summary>
        /// <param name="low">first argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="high">second argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="step">third argument forwarded to <c>GenerateCountArray</c></param>
        public MixVec512Vec256RfTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mixvec512vec256rf";
            this.Description = "Mixed zmm/ymm regs - AVX-512 only, alternating";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>
        /// Only amd64 is supported.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64;
        }

        /// <summary>
        /// Appends the amd64 test functions. Does nothing for any other ISA.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            // use even numbered regs for ymm testing
            string initInstrs = " vmovups (%r8), %zmm1\n" +
                " vmovups 64(%r8), %ymm2\n" +
                " vmovups 128(%r8), %zmm3\n" +
                " vmovups 192(%r8), %ymm4\n" +
                " vmovups 256(%r8), %zmm5\n";

            // use all zmm regs
            for (int reg = 6; reg < 32; reg++)
            {
                bool isEven = (reg & 1) == 0;
                initInstrs += isEven
                    ? "vmovups %ymm2, %ymm" + reg + "\n"
                    : "vmovups %zmm5, %zmm" + reg + "\n";
            }

            // List<string>: the element type is required for this to compile
            List<string> instrsList = new List<string>();
            for (int reg = 1; reg < 32; reg++)
            {
                // even registers get the 256-bit add, odd registers the 512-bit add
                bool isEven = (reg & 1) == 0;
                instrsList.Add(isEven
                    ? $" vaddps %ymm2, %ymm{reg}, %ymm{reg}"
                    : $" vaddps %zmm1, %zmm{reg}, %zmm{reg}");
            }

            string[] unrolledAdds = instrsList.ToArray();
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, initInstrs);
        }
    }
}

================================================
FILE: AsmGen/tests/MmxRfTest.cs
================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// "64-bit MMX RF Capacity Test. x86 only": unrolled paddw on mm1-mm4,
    /// bracketed by fsave / frstor.
    /// </summary>
    public class MmxRfTest : UarchTest
    {
        /// <summary>
        /// Sets up the test metadata.
        /// </summary>
        /// <param name="low">first argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="high">second argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="step">third argument forwarded to <c>GenerateCountArray</c></param>
        public MmxRfTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mmxrf";
            this.Description = "64-bit MMX RF Capacity Test. x86 only";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, int *arr2";
            this.GetFunctionCallParameters = "structIterations, A, B";
            this.DivideTimeByCount = false;
        }

        /// <summary>
        /// Only amd64 is supported.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64;
        }

        /// <summary>
        /// Appends the amd64 test functions. Does nothing for any other ISA.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            GenerateX86GccAsm(sb);
        }

        /// <summary>
        /// Emits the amd64 functions. The init sequence saves state with fsave and loads
        /// mm0-mm4; the cleanup sequence restores the saved state with frstor.
        /// </summary>
        public void GenerateX86GccAsm(StringBuilder sb)
        {
            string setup = " fsave (%r8)\n" +
                " movq (%rdx), %mm0\n" +
                " movq 8(%rdx), %mm1\n" +
                " movq 16(%rdx), %mm2\n" +
                " movq 24(%rdx), %mm3\n" +
                " movq 32(%rdx), %mm4\n";
            string teardown = " frstor (%r8)";

            string[] unrolledAdds =
            {
                " paddw %mm0, %mm1",
                " paddw %mm0, %mm2",
                " paddw %mm0, %mm3",
                " paddw %mm0, %mm4",
            };

            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, initInstrs: setup, cleanupInstrs: teardown);
        }
    }
}

================================================
FILE: AsmGen/tests/MulSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// "Scheduler, Integer Multiplies": four unrolled multiplies per ISA.
    /// Generators exist for amd64, aarch64, mips64 and riscv.
    /// </summary>
    public class MulSchedTest : UarchTest
    {
        /// <summary>
        /// Sets up the test metadata.
        /// </summary>
        /// <param name="low">first argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="high">second argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="step">third argument forwarded to <c>GenerateCountArray</c></param>
        public MulSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "mulsched";
            this.Description = "Scheduler, Integer Multiplies";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        /// <summary>
        /// Supported on amd64, aarch64, mips64 and riscv.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64
                || isa == IUarchTest.ISA.aarch64
                || isa == IUarchTest.ISA.mips64
                || isa == IUarchTest.ISA.riscv;
        }

        /// <summary>
        /// Appends the test functions for the requested ISA; other ISAs emit nothing.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            switch (isa)
            {
                case IUarchTest.ISA.amd64:
                {
                    string[] muls =
                    {
                        " imul %rdi, %r15",
                        " imul %rdi, %r14",
                        " imul %rdi, %r13",
                        " imul %rdi, %r12",
                    };
                    UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, muls, muls, includePtrChasingLoads: false);
                    break;
                }

                case IUarchTest.ISA.aarch64:
                {
                    string[] muls =
                    {
                        " mul x15, x15, x25",
                        " mul x14, x14, x25",
                        " mul x13, x13, x25",
                        " mul x12, x12, x25",
                    };
                    UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, muls, muls, includePtrChasingLoads: false);
                    break;
                }

                case IUarchTest.ISA.mips64:
                {
                    // first set multiplies by $r12, second by $r13
                    string[] mulsByR12 =
                    {
                        " mul.d $r15, $r15, $r12",
                        " mul.d $r16, $r16, $r12",
                        " mul.d $r17, $r17, $r12",
                        " mul.d $r18, $r18, $r12",
                    };
                    string[] mulsByR13 =
                    {
                        " mul.d $r15, $r15, $r13",
                        " mul.d $r16, $r16, $r13",
                        " mul.d $r17, $r17, $r13",
                        " mul.d $r18, $r18, $r13",
                    };
                    UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, mulsByR12, mulsByR13, includePtrChasingLoads: true);
                    break;
                }

                case IUarchTest.ISA.riscv:
                {
                    // first set multiplies by x5, second by x6
                    string[] mulsByX5 =
                    {
                        " mul x30, x30, x5",
                        " mul x29, x29, x5",
                        " mul x28, x28, x5",
                        " mul x31, x31, x5",
                    };
                    string[] mulsByX6 =
                    {
                        " mul x30, x30, x6",
                        " mul x31, x31, x6",
                        " mul x28, x28, x6",
                        " mul x29, x29, x6",
                    };
                    UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, mulsByX5, mulsByX6, false);
                    break;
                }
            }
        }
    }
}

================================================
FILE: AsmGen/tests/NopLoopTest.cs
================================================
using System.Text;

namespace AsmGen
{
    public class NopLoopTest : UarchTest
    {
        /// <summary>
        /// NOP throughput test over loops of varying size. Counts always start at 3.
        /// </summary>
        /// <param name="high">must be greater than 2</param>
        /// <param name="step">third argument forwarded to <c>GenerateCountArray</c></param>
        public NopLoopTest(int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(3, high, step);
            this.Prefix = "noploop";
            this.Description = $"NOP throughput for various loop sizes";
            this.FunctionDefinitionParameters = "uint64_t iterations";
            this.GetFunctionCallParameters = "structIterations";
            this.DivideTimeByCount = true;
        }

        /// <summary>
        /// Supported on amd64 and aarch64 only.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        /// <summary>
        /// Dispatches to the generator for the requested ISA; other ISAs emit nothing.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                GenerateX86GccAsm(sb);
            }

            if (isa == IUarchTest.ISA.aarch64)
            {
                GenerateArmAsm(sb);
            }
        }

        /// <summary>
        /// Emits one amd64 loop per configured count: nops, then dec / jnz back to the label.
        /// </summary>
        public void GenerateX86GccAsm(StringBuilder sb)
        {
            foreach (int loopSize in this.Counts)
            {
                string label = this.Prefix + loopSize;
                sb.AppendLine(label + ":");

                // dec and jnz count towards the loop size, so emit two fewer nops
                for (int remaining = loopSize - 2; remaining > 0; remaining--)
                {
                    sb.AppendLine(" nop");
                }

                sb.AppendLine(" dec %rdi");
                sb.AppendLine(" jnz " + label);
                sb.AppendLine(" ret");
            }
        }

        public void GenerateArmAsm(StringBuilder sb)
        {
            for (int i = 0; i < Counts.Length; i++)
            {
                string funcName = this.Prefix + this.Counts[i];
                sb.AppendLine(funcName + ":");

                // count dec, jnz as instructions in the loop
                for (int nopIdx = 0; nopIdx < this.Counts[i] - 2; nopIdx++) sb.AppendLine(" nop");
                sb.AppendLine(" sub x0, x0, 1");
                sb.AppendLine(" cbnz x0, " +
funcName);
                sb.AppendLine(" ret");
            }
        }
    }
}

================================================
FILE: AsmGen/tests/PdepSchedTest.cs
================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// "Scheduler, PDEP": four unrolled pdep instructions. Only an amd64 generator exists.
    /// </summary>
    public class PdepSchedTest : UarchTest
    {
        /// <summary>
        /// Sets up the test metadata.
        /// </summary>
        /// <param name="low">first argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="high">second argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="step">third argument forwarded to <c>GenerateCountArray</c></param>
        public PdepSchedTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "pdepsched";
            this.Description = "Scheduler, PDEP";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            this.GetFunctionCallParameters = "structIterations, A";
            this.DivideTimeByCount = false;
        }

        /// <summary>
        /// Only amd64 is supported.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64;
        }

        /// <summary>
        /// Appends the amd64 test functions. Does nothing for any other ISA.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            string[] unrolledPdeps =
            {
                " pdep %rdi, %r15, %r15",
                " pdep %rdi, %r14, %r14",
                " pdep %rdi, %r13, %r13",
                " pdep %rdi, %r12, %r12",
            };
            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledPdeps, unrolledPdeps, includePtrChasingLoads: false);
        }
    }
}

================================================
FILE: AsmGen/tests/ReturnStackTest.cs
================================================
using System;
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// "Return Stack Depth Test": for every configured depth, emits a top-level loop that
    /// calls into a chain of nested functions of that depth.
    /// Generators exist for amd64, aarch64, mips64 and riscv.
    /// </summary>
    public class ReturnStackTest : UarchTest
    {
        /// <summary>
        /// Supported on amd64, aarch64, mips64 and riscv.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64) return true;
            if (isa == IUarchTest.ISA.aarch64) return true;
            if (isa == IUarchTest.ISA.mips64) return true;
            if (isa == IUarchTest.ISA.riscv) return true;
            return false;
        }

        /// <summary>
        /// Sets up the test metadata.
        /// </summary>
        /// <param name="low">first argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="high">second argument forwarded to <c>GenerateCountArray</c></param>
        /// <param name="step">third argument forwarded to <c>GenerateCountArray</c></param>
        public ReturnStackTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "returnstack";
            this.Description = "Return Stack Depth Test";
            this.FunctionDefinitionParameters = "uint64_t iterations";
            this.GetFunctionCallParameters = "structIterations";
            this.DivideTimeByCount = true;
        }

        /// <summary>
        /// Name of the nested function at <paramref name="depth"/> within the chain that
        /// belongs to call depth <paramref name="count"/>.
        /// </summary>
        private string GetFunctionName(int count, int depth) { return $"returnstack{count}_{depth}"; }

        private string GetBranchFuncName(int branchCount) { return Prefix + branchCount; }

        public string GetLabelName(string funcName, int part) { return funcName + "part" + part; }

        /// <summary>
        /// Dispatches to the generator for the requested ISA; other ISAs emit nothing.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            switch (isa)
            {
                case IUarchTest.ISA.amd64:
                    GenerateX86GccAsm(sb);
                    break;
                case IUarchTest.ISA.aarch64:
                    GenerateArmAsm(sb);
                    break;
                case IUarchTest.ISA.mips64:
                    GenerateMipsAsm(sb);
                    break;
                case IUarchTest.ISA.riscv:
                    GenerateRiscvAsm(sb);
                    break;
            }
        }

        /// <summary>
        /// Emits the amd64 version: the iteration count is decremented in %rdi and the loop
        /// repeats until it reaches zero.
        /// </summary>
        public void GenerateX86GccAsm(StringBuilder sb)
        {
            foreach (int callDepth in this.Counts)
            {
                string topLevelFunctionLabel = this.Prefix + callDepth;
                sb.AppendLine($"{topLevelFunctionLabel}:");
                sb.AppendLine(" xor %rax, %rax");
                sb.AppendLine($"{topLevelFunctionLabel}_loop:");
                sb.AppendLine($" call " + GetFunctionName(callDepth, 0));
                sb.AppendLine($" dec %rdi");
                sb.AppendLine($" jne {topLevelFunctionLabel}_loop");
                sb.AppendLine(" ret");

                // generate a batch of functions so we aren't returning to the same address
                // otherwise a simple predictor will suffice
                for (int callIdx = 0; callIdx < callDepth; callIdx++)
                {
                    string funcName = GetFunctionName(callDepth, callIdx);
                    sb.AppendLine($".global {funcName}");
                    sb.AppendLine(".align 128"); // https://github.com/clamchowder/Microbenchmarks/issues/14
                    sb.AppendLine($"{funcName}:");

                    // every function except the innermost one calls the next level
                    if (callIdx < callDepth - 1)
                    {
                        sb.AppendLine($" add %rdi, %rax");
                        sb.AppendLine(" call " + GetFunctionName(callDepth, callIdx + 1));
                    }

                    sb.AppendLine(".align 128");
                    sb.AppendLine(" ret");
                }
            }
        }

        /// <summary>
        /// Emits the aarch64 version: x0 is the iteration counter, and x29/x30 are saved on
        /// the stack around every bl.
        /// </summary>
        public void GenerateArmAsm(StringBuilder sb)
        {
            foreach (int callDepth in this.Counts)
            {
                string topLevelFunctionLabel = this.Prefix + callDepth;
                sb.AppendLine($"{topLevelFunctionLabel}:");
                sb.AppendLine(" sub sp, sp, #0x20");
                sb.AppendLine(" stp x29, x30, [sp, #0x10]");
                sb.AppendLine(" eor x3, x3, x3");
                sb.AppendLine($"{topLevelFunctionLabel}_loop:");
                sb.AppendLine($" bl " + GetFunctionName(callDepth, 0));
                sb.AppendLine(" sub x0, x0, 1");
                sb.AppendLine($" cbnz x0, {topLevelFunctionLabel}_loop");
                sb.AppendLine(" ldp x29, x30, [sp, #0x10]");
                sb.AppendLine(" add sp, sp, #0x20");
                sb.AppendLine(" ret");

                for (int callIdx = 0; callIdx < callDepth; callIdx++)
                {
                    string funcName = GetFunctionName(callDepth, callIdx);
                    sb.AppendLine($".global {funcName}");
                    sb.AppendLine($"{funcName}:");
                    sb.AppendLine($" add x3, x3, x0");
                    if (callIdx < callDepth - 1)
                    {
                        // 'bl' is like x86 'call', except it's like the kid that falls asleep in the middle of class
                        // it doesn't push the return address, so you have to do that yourself
                        sb.AppendLine(" sub sp, sp, #0x20");
                        sb.AppendLine(" stp x29, x30, [sp, #0x10]");
                        sb.AppendLine(" bl " + GetFunctionName(callDepth, callIdx + 1));
                        sb.AppendLine(" ldp x29, x30, [sp, #0x10]");
                        sb.AppendLine(" add sp, sp, #0x20");
                    }

                    sb.AppendLine(" ret");
                }
            }
        }

        /// <summary>
        /// Emits the mips64 generator's version: $r4 is the iteration counter, $r12 holds 1
        /// and $r13 holds 8 (the stack adjustment), and $r1 is saved around every bl.
        /// </summary>
        public void GenerateMipsAsm(StringBuilder sb)
        {
            foreach (int callDepth in this.Counts)
            {
                string topLevelFunctionLabel = this.Prefix + callDepth;
                sb.AppendLine($"{topLevelFunctionLabel}:");

                // top level function runs for specified number of iterations
                sb.AppendLine(" xor $r12, $r12, $r12");
                sb.AppendLine(" xor $r13, $r13, $r13");
                sb.AppendLine(" addi.d $r12, $r12, 1");
                sb.AppendLine(" addi.d $r13, $r13, 8");
                sb.AppendLine(" sub.d $sp, $sp, $r13");
                sb.AppendLine(" st.d $r1, $sp, 0");
                sb.AppendLine($"{topLevelFunctionLabel}_loop:");

                // mips stack grows down
                sb.AppendLine($" bl " + GetFunctionName(callDepth, 0));
                sb.AppendLine(" sub.d $r4, $r4, $r12");
                sb.AppendLine($" bnez $r4, {topLevelFunctionLabel}_loop");
                sb.AppendLine(" ld.d $r1, $sp, 0");
                sb.AppendLine(" add.d $sp, $sp, $r13");
                sb.AppendLine(" jr $r1");

                // generate the dummy functions
                for (int callIdx = 0; callIdx < callDepth; callIdx++)
                {
                    string funcName = GetFunctionName(callDepth, callIdx);
                    sb.AppendLine($".global {funcName}");
                    sb.AppendLine($"{funcName}:");
                    if (callIdx < callDepth - 1)
                    {
                        sb.AppendLine(" sub.d $sp, $sp, $r13");
                        sb.AppendLine(" st.d $r1, $sp, 0"); // save return address
                        sb.AppendLine(" bl " + GetFunctionName(callDepth, callIdx + 1));
                        sb.AppendLine(" ld.d $r1, $sp, 0"); // load return address
                        sb.AppendLine(" add.d $sp, $sp, $r13");
                    }

                    sb.AppendLine(" jr $r1");
                }
            }
        }

        /// <summary>
        /// Emits the riscv version: x10 is the iteration counter and ra is saved on the
        /// stack around every jal.
        /// </summary>
        public void GenerateRiscvAsm(StringBuilder sb)
        {
            foreach (int callDepth in this.Counts)
            {
                string topLevelFunctionLabel = this.Prefix + callDepth;
                sb.AppendLine($"{topLevelFunctionLabel}:");

                // top level function runs for specified number of iterations
                // iteration count in x10
                sb.AppendLine(" addi sp, sp, -16");
                sb.AppendLine(" sd ra, (sp)");
                sb.AppendLine($"{topLevelFunctionLabel}_loop:");
                sb.AppendLine($" jal " + GetFunctionName(callDepth, 0));
                sb.AppendLine(" addi x10, x10, -1");

                // Loop until the counter reaches zero, like the jne / cbnz / bnez loops of
                // the other ISAs. A "bge x10, x0" test here would also run the pass where
                // the counter is already 0, i.e. one call chain more than requested.
                sb.AppendLine($" bnez x10, {topLevelFunctionLabel}_loop");
                sb.AppendLine(" ld ra, (sp)");
                sb.AppendLine(" addi sp, sp, 16");
                sb.AppendLine(" ret");

                // generate the dummy functions
                for (int callIdx = 0; callIdx < callDepth; callIdx++)
                {
                    string funcName = GetFunctionName(callDepth, callIdx);
                    sb.AppendLine($".global {funcName}");
                    sb.AppendLine($"{funcName}:");
                    if (callIdx < callDepth - 1)
                    {
                        sb.AppendLine(" addi sp, sp, -16"); // keep stack pointer 16B aligned even though we only save a 8B reg
                        sb.AppendLine(" sd ra, (sp)"); // save return address
                        sb.AppendLine(" jal " + GetFunctionName(callDepth, callIdx + 1));
                        sb.AppendLine(" ld ra, (sp)"); // load return address
                        sb.AppendLine(" addi sp, sp, 16");
                    }

                    sb.AppendLine(" ret");
                }
            }
        }
    }
}

================================================
FILE: AsmGen/tests/RobTest.cs
================================================
using System.Text;

namespace AsmGen
{
    public class RobTest : UarchTest
    {
        private string[]
nops; private bool initialDependentBranch; public RobTest(int low, int high, int step, bool initialDependentBranch) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "rob" + (initialDependentBranch ? "db" : string.Empty); this.Description = "Reorder Buffer Test" + (initialDependentBranch ? " preceded by dependent branch" : string.Empty); this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; this.GetFunctionCallParameters = "structIterations, A"; this.DivideTimeByCount = false; this.nops = new string[] { "nop" }; this.initialDependentBranch = initialDependentBranch; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (this.initialDependentBranch) { if (isa == IUarchTest.ISA.aarch64) return true; if (isa == IUarchTest.ISA.riscv) return true; return false; } if (isa == IUarchTest.ISA.amd64) return true; if (isa == IUarchTest.ISA.aarch64) return true; if (isa == IUarchTest.ISA.mips64) return true; if (isa == IUarchTest.ISA.riscv) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, true); } else if (isa == IUarchTest.ISA.aarch64) { string postLoadInstrs = this.initialDependentBranch ? UarchTestHelpers.GetArmDependentBranch(this.Prefix) : null; UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs); if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix)); } else if (isa == IUarchTest.ISA.mips64) { UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, includePtrChasingLoads: true); } else if (isa == IUarchTest.ISA.riscv) { string postLoadInstrs = this.initialDependentBranch ? 
UarchTestHelpers.GetRiscvDependentBranch(this.Prefix) : null; UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, nops, nops, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs); if (this.initialDependentBranch) sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix)); } } } } ================================================ FILE: AsmGen/tests/RorSchedTest.cs ================================================ using System.Text; namespace AsmGen { public class RorSchedTest : UarchTest { public RorSchedTest(int low, int high, int step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "rorsched"; this.Description = "Scheduler, Integer Rotate by Immediate (1)"; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; this.GetFunctionCallParameters = "structIterations, A"; this.DivideTimeByCount = false; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { string postLoadInstrs = " mov %rdi, %r15"; string postLoadInstrs2 = " mov %rsi, %r15"; string[] unrolledInstrs = new string[1]; unrolledInstrs[0] = " ror $1, %r15"; UarchTestHelpers.GenerateX86AsmStructureTestFuncs( sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs2, includePtrChasingLoads: false); } } } } ================================================ FILE: AsmGen/tests/ShlSchedTest.cs ================================================ using System.Text; namespace AsmGen { public class ShlSchedTest : UarchTest { public ShlSchedTest(int low, int high, int step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "shlsched"; this.Description = "Scheduler, Integer Shift by Immediate (1)"; 
this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; this.GetFunctionCallParameters = "structIterations, A"; this.DivideTimeByCount = false; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { string postLoadInstrs = " mov %rdi, %r15"; string postLoadInstrs2 = " mov %rsi, %r15"; string[] unrolledInstrs = new string[1]; unrolledInstrs[0] = " shl $1, %r15"; UarchTestHelpers.GenerateX86AsmStructureTestFuncs( sb, this.Counts, this.Prefix, unrolledInstrs, unrolledInstrs, postLoadInstrs1: postLoadInstrs, postLoadInstrs2: postLoadInstrs2, includePtrChasingLoads: false); } } } }
================================================ FILE: AsmGen/tests/StoreDataDivNsqTest.cs ================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Store data scheduler test that relies on the "DivNsq" helper generators
    /// (divisions are used to block retirement, per the test description).
    /// Generators exist for amd64 and aarch64.
    /// </summary>
    public class StoreDataDivNsqTest : UarchTest
    {
        /// <summary>
        /// Configures the test metadata and the list of counts to generate.
        /// </summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public StoreDataDivNsqTest(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "storedatadivnsq";
            Description = "Store Data Scheduler, using DIVs to block retirement";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            GetFunctionCallParameters = "structIterations, A, fpArr";
            DivideTimeByCount = false;
        }

        /// <summary>Returns true for amd64 and aarch64, false otherwise.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        /// <summary>
        /// Appends the test functions for <paramref name="isa"/> to <paramref name="sb"/>.
        /// Does nothing for ISAs without a generator.
        /// </summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target instruction set.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // idiv puts remainder in RDX, so a store whose data register is RDX
                // has to wait for the division; the R14 store does not.
                const string storeOfDivResult = " mov %rdx, (%r8, %r15, 4)";
                const string storeOfReadyData = " mov %r14, (%r8, %r11, 4)";

                string[] dependentStores =
                {
                    storeOfDivResult, storeOfDivResult, storeOfDivResult, storeOfDivResult,
                };
                string[] independentStores =
                {
                    storeOfReadyData, storeOfReadyData, storeOfReadyData, storeOfReadyData,
                };

                // The helper is also given the largest count in the array.
                int largestCount = Counts[Counts.Length - 1];
                UarchTestHelpers.GenerateX86AsmDivNsqTestFuncs(
                    sb, largestCount, Counts, Prefix, dependentStores, independentStores);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // Both stores use the same address operands; only the data register differs.
                // NOTE(review): presumably w25 carries the division result - confirm in the helper.
                string[] dependentStores = { " str w25, [x2, w15, uxtw #2]" };
                string[] independentStores = { " str w15, [x2, w15, uxtw #2]" };

                int largestCount = Counts[Counts.Length - 1];
                UarchTestHelpers.GenerateArmAsmDivNsqTestFuncs(
                    sb, largestCount, Counts, Prefix, dependentStores, independentStores);
            }
        }
    }
}
================================================ FILE: AsmGen/tests/StoreDataNsqTest.cs ================================================ using System.Text; namespace AsmGen { public class StoreDataNsq : UarchTest { public StoreDataNsq(int low, int high, int step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "storedatansq"; this.Description = "Store Data Scheduler, excluding NSQ"; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; this.GetFunctionCallParameters = "structIterations, A, fpArr"; this.DivideTimeByCount = false; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) return true; // if (isa == IUarchTest.ISA.aarch64) return true; // if (isa == IUarchTest.ISA.mips64) return true; // if (isa == IUarchTest.ISA.riscv) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { string[] dependentLoads = new string[4]; dependentLoads[0] = " mov %rdi, (%r8)"; dependentLoads[1] = " mov %rdi, 8(%r8)"; dependentLoads[2] = " mov %rdi, 16(%r8)"; dependentLoads[3] = " mov %rdi, 24(%r8)"; string[] independentLoads = new string[4];
independentLoads[0] = " mov %r14, (%r8)"; independentLoads[1] = " mov %r14, 8(%r8)"; independentLoads[2] = " mov %r14, 16(%r8)"; independentLoads[3] = " mov %r14, 24(%r8)"; UarchTestHelpers.GenerateX86AsmNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentLoads, independentLoads); } } } }
================================================ FILE: AsmGen/tests/StoreDataSchedTest.cs ================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Store data scheduler test. The filler instructions are stores whose data
    /// register differs between the two filler sets while the address stays fixed.
    /// </summary>
    public class StoreDataSchedTest : UarchTest
    {
        /// <summary>
        /// Configures the test metadata and the list of filler counts to generate.
        /// </summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public StoreDataSchedTest(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "storedatasched";
            Description = "Store Data Scheduler";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            GetFunctionCallParameters = "structIterations, A, fpArr";
            DivideTimeByCount = false;
        }

        /// <summary>
        /// Returns true for amd64 and aarch64 only. GenerateAsm also contains mips64 and
        /// riscv branches, but they are deliberately not advertised here.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        /// <summary>
        /// Appends the test functions for <paramref name="isa"/> to <paramref name="sb"/>.
        /// </summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target instruction set.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // First filler set stores rdi, second stores rsi; addresses are fixed offsets from r8.
                string[] storesOfFirstValue =
                {
                    " mov %rdi, (%r8)",
                    " mov %rdi, 8(%r8)",
                    " mov %rdi, 16(%r8)",
                    " mov %rdi, 24(%r8)",
                };
                string[] storesOfSecondValue =
                {
                    " mov %rsi, (%r8)",
                    " mov %rsi, 8(%r8)",
                    " mov %rsi, 16(%r8)",
                    " mov %rsi, 24(%r8)",
                };

                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                    sb, Counts, Prefix, storesOfFirstValue, storesOfSecondValue, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // First filler set stores w25, second stores w26; addresses are fixed offsets from x2.
                string[] storesOfFirstValue =
                {
                    " str w25, [x2, 8]",
                    " str w25, [x2, 16]",
                    " str w25, [x2, 24]",
                    " str w25, [x2, 32]",
                };
                string[] storesOfSecondValue =
                {
                    " str w26, [x2, 8]",
                    " str w26, [x2, 16]",
                    " str w26, [x2, 24]",
                    " str w26, [x2, 32]",
                };

                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb, Counts, Prefix, storesOfFirstValue, storesOfSecondValue, includePtrChasingLoads: true);
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                // NOTE(review): SupportsIsa does not report mips64, and the filler below is made
                // of loads (ld.d) with offsets 0/8/12/16 rather than stores - looks unfinished.
                string addressFromFirstValue = " andi $r19, $r12, 0xF\n add.d $r19, $r19, $r6";
                string addressFromSecondValue = " andi $r19, $r13, 0xF\n add.d $r19, $r19, $r6";
                string[] filler =
                {
                    " ld.d $r15, $r19, 0",
                    " ld.d $r16, $r19, 8",
                    " ld.d $r17, $r19, 12",
                    " ld.d $r18, $r19, 16",
                };

                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                    sb,
                    Counts,
                    Prefix,
                    filler,
                    filler,
                    includePtrChasingLoads: true,
                    null,
                    postLoadInstrs1: addressFromFirstValue,
                    postLoadInstrs2: addressFromSecondValue);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                // NOTE(review): SupportsIsa does not report riscv, and the filler below is made
                // of loads (ld) rather than stores - looks unfinished.
                // x5 and x6 are pointer chasing loads
                string addressFromFirstValue = " andi x7, x5, 0xF\n add x7, x7, x12";
                string addressFromSecondValue = " andi x7, x6, 0xF\n add x7, x7, x12";
                string[] filler =
                {
                    " ld x28, (x7)",
                    " ld x29, 8(x7)",
                    " ld x30, 16(x7)",
                    " ld x31, 24(x7)",
                };

                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(
                    sb,
                    Counts,
                    Prefix,
                    filler,
                    filler,
                    includePtrChasingLoads: true,
                    postLoadInstrs1: addressFromFirstValue,
                    postLoadInstrs2: addressFromSecondValue);
            }
        }
    }
}
================================================ FILE: AsmGen/tests/StoreDivNsqTest.cs ================================================ using System.Text; namespace AsmGen { public class StoreDivNsqTest : UarchTest { public StoreDivNsqTest(int low, int high, int
step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "storedivnsq"; this.Description = "Store Scheduler, using DIVs to block retirement, excluding NSQ"; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; this.GetFunctionCallParameters = "structIterations, A, fpArr"; this.DivideTimeByCount = false; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) return true; if (isa == IUarchTest.ISA.aarch64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { // idiv puts remainder in RDX string[] dependentStores = new string[4]; dependentStores[0] = " mov %r15w, (%r8, %rdx, 2)"; dependentStores[1] = " mov %r15w, 2(%r8, %rdx, 2)"; dependentStores[2] = " mov %r15w, 4(%r8, %rdx, 2)"; dependentStores[3] = " mov %r15w, 6(%r8, %rdx, 2)"; string[] indepStores = new string[4]; indepStores[0] = " mov %r11w, (%r8)"; indepStores[1] = " mov %r11w, 2(%r8)"; indepStores[2] = " mov %r11w, 4(%r8)"; indepStores[3] = " mov %r11w, 6(%r8)"; UarchTestHelpers.GenerateX86AsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentStores, indepStores); } else if (isa == IUarchTest.ISA.aarch64) { string[] dependentStores = new string[1]; dependentStores[0] = " str w15, [x2, w25, uxtw #2]"; string[] independentStores = new string[1]; independentStores[0] = " str w15, [x2, w15, uxtw #2]"; UarchTestHelpers.GenerateArmAsmDivNsqTestFuncs(sb, this.Counts[this.Counts.Length - 1], this.Counts, this.Prefix, dependentStores, independentStores); } } } } ================================================ FILE: AsmGen/tests/StoreDivSchedTest.cs ================================================ using System.Text; namespace AsmGen { public class StoreDivSchedTest : UarchTest { public StoreDivSchedTest(int low, int high, int step) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, 
step); this.Prefix = "storedivsched"; this.Description = "Store Address Scheduler Capacity Test, using divs to block retirement"; this.FunctionDefinitionParameters = "uint64_t iterations, int count, int *arr2"; this.GetFunctionCallParameters = "structIterations, list_size, B"; this.DivideTimeByCount = false; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) return true; if (isa == IUarchTest.ISA.aarch64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { GenerateX86Asm(sb); } else if (isa == IUarchTest.ISA.aarch64) { GenerateArmAsm(sb); } } public void GenerateX86Asm(StringBuilder sb) { string[] dependentStores = new string[4]; dependentStores[0] = " mov %r15, (%r8, %rdx, 4)"; dependentStores[1] = " mov %r15, (%r8, %rdx, 4)"; dependentStores[2] = " mov %r15, (%r8, %rdx, 4)"; dependentStores[3] = " mov %r15, (%r8, %rdx, 4)"; string[] dependentStores1 = new string[4]; dependentStores1[0] = " mov %r11, (%r8, %rdx, 4)"; dependentStores1[1] = " mov %r11, (%r8, %rdx, 4)"; dependentStores1[2] = " mov %r11, (%r8, %rdx, 4)"; dependentStores1[3] = " mov %r11, (%r8, %rdx, 4)"; // instead of using pointer chasing loads, use a nasty block of chained integer divisions to block retirement // some older/less capable architectures will not reorder loads ahead of stores with unknown addresses, // which breaks the usual technique UarchTestHelpers.GenerateX86AsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, false); } public void GenerateArmAsm(StringBuilder sb) { string[] dependentStores = new string[4]; dependentStores[0] = " str w15, [x2, w25, uxtw #2]"; dependentStores[1] = " str w15, [x2, w25, uxtw #2]"; dependentStores[2] = " str w15, [x2, w25, uxtw #2]"; dependentStores[3] = " str w15, [x2, w25, uxtw #2]"; string[] dependentStores1 = new string[4]; dependentStores1[0] = " str w15, [x2, w26, uxtw #2]"; 
dependentStores1[1] = " str w15, [x2, w26, uxtw #2]"; dependentStores1[2] = " str w15, [x2, w26, uxtw #2]"; dependentStores1[3] = " str w15, [x2, w26, uxtw #2]"; UarchTestHelpers.GenerateArmAsmDivStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, false); } } }
================================================ FILE: AsmGen/tests/StoreNsq.cs ================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Store address scheduler test that uses the "Nsq" helper generator.
    /// Only an aarch64 generator is provided.
    /// </summary>
    public class StoreNsq : UarchTest
    {
        /// <summary>
        /// Configures the test metadata and the list of counts to generate.
        /// </summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        public StoreNsq(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "storensq";
            Description = "Store Address Scheduler, Excluding any NSQ";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            GetFunctionCallParameters = "structIterations, A, fpArr";
            DivideTimeByCount = false;
        }

        /// <summary>Returns true only for aarch64.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa) => isa == IUarchTest.ISA.aarch64;

        /// <summary>
        /// Appends the aarch64 test functions to <paramref name="sb"/>. Does nothing for other ISAs.
        /// </summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target instruction set.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.aarch64)
            {
                return;
            }

            // The two sets differ only in the index register forming the store address
            // (w25 vs. w26); the data registers are the same in both.
            string[] storesIndexedByW25 =
            {
                " str w15, [x2, w25, uxtw #2]",
                " str w14, [x2, w25, uxtw #2]",
                " str w13, [x2, w25, uxtw #2]",
                " str w12, [x2, w25, uxtw #2]",
            };
            string[] storesIndexedByW26 =
            {
                " str w15, [x2, w26, uxtw #2]",
                " str w14, [x2, w26, uxtw #2]",
                " str w13, [x2, w26, uxtw #2]",
                " str w12, [x2, w26, uxtw #2]",
            };

            // The helper is also given the largest count in the array.
            int largestCount = Counts[Counts.Length - 1];
            UarchTestHelpers.GenerateArmAsmNsqTestFuncs(
                sb, largestCount, Counts, Prefix, storesIndexedByW25, storesIndexedByW26);
        }
    }
}
================================================ FILE: AsmGen/tests/StoreSchedTest.cs ================================================ using System.Text; namespace AsmGen { public class StoreSchedTest : UarchTest { public StoreSchedTest(int low, int high, int step) { this.Counts =
UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "storesched"; this.Description = "Store Address Scheduler"; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; this.GetFunctionCallParameters = "structIterations, A, fpArr"; this.DivideTimeByCount = false; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) return true; if (isa == IUarchTest.ISA.aarch64) return true; if (isa == IUarchTest.ISA.riscv) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { string[] dependentStores = new string[4]; dependentStores[0] = " mov %r15, (%r8, %rdi, 4)"; dependentStores[1] = " mov %r14, (%r8, %rdi, 4)"; dependentStores[2] = " mov %r13, (%r8, %rdi, 4)"; dependentStores[3] = " mov %r12, (%r8, %rdi, 4)"; string[] dependentStores1 = new string[4]; dependentStores1[0] = " mov %r15, (%r8, %rsi, 4)"; dependentStores1[1] = " mov %r14, (%r8, %rsi, 4)"; dependentStores1[2] = " mov %r13, (%r8, %rsi, 4)"; dependentStores1[3] = " mov %r12, (%r8, %rsi, 4)"; UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, includePtrChasingLoads: true); } else if (isa == IUarchTest.ISA.aarch64) { string[] dependentStores = new string[4]; dependentStores[0] = " str w15, [x2, w25, uxtw #2]"; dependentStores[1] = " str w14, [x2, w25, uxtw #2]"; dependentStores[2] = " str w13, [x2, w25, uxtw #2]"; dependentStores[3] = " str w12, [x2, w25, uxtw #2]"; string[] dependentStores1 = new string[4]; dependentStores1[0] = " str w15, [x2, w26, uxtw #2]"; dependentStores1[1] = " str w14, [x2, w26, uxtw #2]"; dependentStores1[2] = " str w13, [x2, w26, uxtw #2]"; dependentStores1[3] = " str w12, [x2, w26, uxtw #2]"; UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentStores, dependentStores1, includePtrChasingLoads: true); } else if (isa == 
IUarchTest.ISA.riscv) { // x5 and x6 are pointer chasing loads string postLoadInstrs1 = " andi x7, x5, 0xF\n add x7, x7, x12"; string postLoadInstrs2 = " andi x7, x6, 0xF\n add x7, x7, x12"; string[] dependentLoads = new string[4]; dependentLoads[0] = " sd x28, (a2)"; dependentLoads[1] = " sd x29, 8(a2)"; dependentLoads[2] = " sd x30, 16(a2)"; dependentLoads[3] = " sd x31, 24(a2)"; UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, this.Counts, this.Prefix, dependentLoads, dependentLoads, includePtrChasingLoads: true, postLoadInstrs1: postLoadInstrs1, postLoadInstrs2: postLoadInstrs2); } } } }
================================================ FILE: AsmGen/tests/Stq128Test.cs ================================================
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Store queue test using 128-bit stores. Optionally places a dependent branch
    /// after each load (aarch64 and riscv only).
    /// </summary>
    public class Stq128Test : UarchTest
    {
        // When set, the generated code includes the helper-provided dependent branch
        // and the test name gets a "db" suffix.
        private readonly bool initialDependentBranch;

        /// <summary>
        /// Configures the test metadata and the list of store counts to generate.
        /// </summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="initialDependentBranch">
        /// True to generate the variant that inserts a dependent branch.
        /// </param>
        public Stq128Test(int low, int high, int step, bool initialDependentBranch)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "stq128" + (initialDependentBranch ? "db" : string.Empty);

            // Fixed: this used to read "independent branch", contradicting the flag name,
            // the "db" prefix and the dependent-branch helpers used below.
            this.Description = "Store Queue with 128-bit stores"
                + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.initialDependentBranch = initialDependentBranch;
        }

        /// <summary>
        /// aarch64 and riscv are always supported; amd64 only when no dependent branch
        /// is requested (the amd64 generator has no dependent-branch variant).
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.riscv)
            {
                return true;
            }

            return !this.initialDependentBranch && isa == IUarchTest.ISA.amd64;
        }

        /// <summary>
        /// Appends the test functions for <paramref name="isa"/> to <paramref name="sb"/>.
        /// Does nothing for ISAs without a generator.
        /// </summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target instruction set.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                // xmm1 is loaded once up front, then stored repeatedly to the same address.
                string initInstrs = " movups (%rdx), %xmm1";
                const string store = " movaps %xmm1, (%r8)";
                string[] unrolledStores = { store, store, store, store };

                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                    sb,
                    this.Counts,
                    this.Prefix,
                    unrolledStores,
                    unrolledStores,
                    initInstrs: initInstrs,
                    includePtrChasingLoads: false);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                // NOTE(review): the original declared an unused local
                // `initInstrs = " ldr q0, [x1]"` here, so q0 is stored without being loaded
                // by this test. If the ARM helper accepts init instructions, it was probably
                // meant to be passed - confirm against UarchTestHelpers.
                string postLoadInstrs = this.initialDependentBranch
                    ? UarchTestHelpers.GetArmDependentBranch(this.Prefix)
                    : null;
                const string store = " str q0, [x2]";
                string[] unrolledStores = { store, store, store, store };

                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb,
                    this.Counts,
                    this.Prefix,
                    unrolledStores,
                    unrolledStores,
                    includePtrChasingLoads: false,
                    postLoadInstrs1: postLoadInstrs,
                    postLoadInstrs2: postLoadInstrs);

                // The branch target is emitted once, after all generated functions.
                if (this.initialDependentBranch)
                {
                    sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
                }
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                // Set up the vector unit and load v0 once before the loop.
                string initInstrs = " mv t6, x0\n addi t6, t6, 16\n vsetvli t5, t6, e32\n vlw.v v0, (a1)";

                // After each load: optional dependent branch, then reset the store pointer to a2.
                string postLoadInstrs = this.initialDependentBranch
                    ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix)
                    : string.Empty;
                postLoadInstrs += "\n mv t6, a2";

                // Each store advances the pointer by 64 bytes.
                string[] unrolledStores = { " vsw.v v0, (t6)\n addi t6, t6, 64" };

                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(
                    sb,
                    this.Counts,
                    this.Prefix,
                    unrolledStores,
                    unrolledStores,
                    false,
                    initInstrs: initInstrs,
                    postLoadInstrs1: postLoadInstrs,
                    postLoadInstrs2: postLoadInstrs);

                if (this.initialDependentBranch)
                {
                    sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
                }
            }
        }
    }
}
================================================ FILE: AsmGen/tests/Stq512Test.cs ================================================ using System.Collections.Generic; using System.Text; namespace AsmGen { public class Stq512Test : UarchTest { private bool differentLines; public Stq512Test(int low, int high, int step, bool differentLines) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "stq512" + (differentLines ?
"dl" : string.Empty); this.Description = "Store Queue with 512-bit stores - AVX-512 only"; if (differentLines) this.Description += " with multiple lines"; this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr"; this.GetFunctionCallParameters = "structIterations, A, fpArr"; this.DivideTimeByCount = false; this.differentLines = differentLines; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) { string initInstrs = " vmovaps (%r8), %zmm0\n vmovaps %zmm0, %zmm1"; string[] unrolledStores; if (differentLines) { List unrolledStoresList = new List(); int maxOffset = 512, currentOffset = 0; for (int i = 0; i < this.Counts[this.Counts.Length - 1]; i++) { string loadOffset = currentOffset > 0 ? currentOffset.ToString() : string.Empty; string nextInstr = $" vmovaps %zmm0, {loadOffset}(%r8)"; unrolledStoresList.Add(nextInstr); if (currentOffset >= maxOffset) { currentOffset = 0; } else currentOffset += 64; unrolledStoresList.Add(" vmovaps %zmm0, (%r8)"); } unrolledStores = unrolledStoresList.ToArray(); } else { unrolledStores = new string[2]; unrolledStores[0] = " vmovaps %zmm0, (%r8)"; unrolledStores[1] = " vmovaps %zmm1, (%r8)"; } UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false, initInstrs: initInstrs); } } } }
================================================ FILE: AsmGen/tests/StqTest.cs ================================================
using System.Collections.Generic;
using System.Text;

namespace AsmGen
{
    /// <summary>
    /// Store queue test using scalar stores. Optionally places a dependent branch after
    /// each load (aarch64 and riscv only) and optionally spreads the stores over
    /// different addresses (amd64 and riscv only).
    /// </summary>
    public class StqTest : UarchTest
    {
        // When set, the generated code includes the helper-provided dependent branch
        // and the test name gets a "db" suffix.
        private readonly bool initialDependentBranch;

        // When set, the amd64 and riscv generators emit stores to distinct addresses
        // instead of repeatedly hitting the same few locations.
        private readonly bool spaced;

        /// <summary>
        /// Configures the test metadata and the list of store counts to generate.
        /// </summary>
        /// <param name="low">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="high">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="step">Forwarded to <c>UarchTestHelpers.GenerateCountArray</c>.</param>
        /// <param name="initialDependentBranch">
        /// True to generate the variant that inserts a dependent branch.
        /// </param>
        /// <param name="spaced">True to spread the stores across different addresses.</param>
        public StqTest(int low, int high, int step, bool initialDependentBranch, bool spaced)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);

            // NOTE(review): `spaced` does not affect the prefix, so a spaced and a non-spaced
            // instance with the same branch setting would share function names - confirm
            // callers never register both.
            this.Prefix = "stq" + (initialDependentBranch ? "db" : string.Empty);

            // Fixed: this used to read "independent branch", contradicting the flag name,
            // the "db" prefix and the dependent-branch helpers used below.
            this.Description = "Store Queue"
                + (initialDependentBranch ? ", preceded by dependent branch" : string.Empty);
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
            this.initialDependentBranch = initialDependentBranch;
            this.spaced = spaced;
        }

        /// <summary>
        /// aarch64 and riscv are always supported; amd64 and mips64 only when no dependent
        /// branch is requested (their generators have no dependent-branch variant).
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.aarch64 || isa == IUarchTest.ISA.riscv)
            {
                return true;
            }

            if (this.initialDependentBranch)
            {
                return false;
            }

            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.mips64;
        }

        /// <summary>
        /// Appends the test functions for <paramref name="isa"/> to <paramref name="sb"/>.
        /// Does nothing for ISAs without a generator.
        /// </summary>
        /// <param name="sb">Destination for the generated assembly text.</param>
        /// <param name="isa">Target instruction set.</param>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] unrolledStores;
                string postLoadInstrs = "";
                if (this.spaced)
                {
                    // Start each half from the base pointer, then walk forward one
                    // 64-byte step per store so every store goes to a different cache line.
                    postLoadInstrs = "mov %r8, %r11";
                    int largestCount = this.Counts[this.Counts.Length - 1];

                    // Fixed: the list needs its element type; bare `List` does not exist in
                    // System.Collections.Generic.
                    List<string> storeInstrs = new List<string>(largestCount);
                    for (int i = 0; i < largestCount; i++)
                    {
                        storeInstrs.Add(" mov %r15, (%r11)\n add $64, %r11");
                    }

                    unrolledStores = storeInstrs.ToArray();
                }
                else
                {
                    unrolledStores = new[]
                    {
                        " mov %r15, (%r8)",
                        " mov %r14, (%r8)",
                        " mov %r13, (%r8)",
                        " mov %r12, (%r8)",
                    };
                }

                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                    sb,
                    this.Counts,
                    this.Prefix,
                    unrolledStores,
                    unrolledStores,
                    postLoadInstrs1: postLoadInstrs,
                    postLoadInstrs2: postLoadInstrs,
                    includePtrChasingLoads: false);
            }
            else if (isa == IUarchTest.ISA.aarch64)
            {
                string postLoadInstrs = this.initialDependentBranch
                    ? UarchTestHelpers.GetArmDependentBranch(this.Prefix)
                    : null;
                string[] unrolledStores =
                {
                    " str x15, [x2]",
                    " str x14, [x2]",
                    " str x13, [x2]",
                    " str x12, [x2]",
                };

                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(
                    sb,
                    this.Counts,
                    this.Prefix,
                    unrolledStores,
                    unrolledStores,
                    includePtrChasingLoads: false,
                    postLoadInstrs1: postLoadInstrs,
                    postLoadInstrs2: postLoadInstrs);

                // The branch target is emitted once, after all generated functions.
                if (this.initialDependentBranch)
                {
                    sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(this.Prefix));
                }
            }
            else if (isa == IUarchTest.ISA.mips64)
            {
                string[] unrolledStores =
                {
                    " st.d $r15, $r6, 0",
                    " st.d $r16, $r6, 0",
                    " st.d $r17, $r6, 0",
                    " st.d $r18, $r6, 0",
                };

                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(
                    sb, this.Counts, this.Prefix, unrolledStores, unrolledStores, includePtrChasingLoads: false);
            }
            else if (isa == IUarchTest.ISA.riscv)
            {
                string postLoadInstrs = this.initialDependentBranch
                    ? UarchTestHelpers.GetRiscvDependentBranch(this.Prefix)
                    : null;
                string[] unrolledStores;
                if (this.spaced)
                {
                    // 32 stores at 16-byte intervals from x12.
                    // NOTE(review): the amd64 variant spaces stores 64 bytes apart; 16 here
                    // may be intentional, but confirm.
                    List<string> stores = new List<string>(32);
                    for (int i = 0; i < 32; i++)
                    {
                        stores.Add($" sd x28, {i * 16}(x12)");
                    }

                    unrolledStores = stores.ToArray();
                }
                else
                {
                    unrolledStores = new[]
                    {
                        " sd x28, (x12)",
                        " sd x29, 8(x12)",
                        " sd x30, 16(x12)",
                        " sd x31, 24(x12)",
                    };
                }

                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(
                    sb,
                    this.Counts,
                    this.Prefix,
                    unrolledStores,
                    unrolledStores,
                    false,
                    postLoadInstrs1: postLoadInstrs,
                    postLoadInstrs2: postLoadInstrs);

                if (this.initialDependentBranch)
                {
                    sb.AppendLine(UarchTestHelpers.GetRiscvDependentBranchTarget(this.Prefix));
                }
            }
        }
    }
}
================================================ FILE: AsmGen/tests/TakenBranchBufferTest.cs ================================================ using System.Text; namespace AsmGen { public class TakenBranchBufferTest : UarchTest { private bool initialDependentBranch; public TakenBranchBufferTest(int low, int high, int step, bool initialDependentBranch) { this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step); this.Prefix = "tbb" + (initialDependentBranch ? "db" : string.Empty); this.Description = "Taken Branch Buffer Test (taken branches pending retire)" + (initialDependentBranch ?
", preceded by dependent branch" : string.Empty); this.FunctionDefinitionParameters = "uint64_t iterations, int *arr"; this.GetFunctionCallParameters = "structIterations, A"; this.DivideTimeByCount = false; this.initialDependentBranch = initialDependentBranch; } public override bool SupportsIsa(IUarchTest.ISA isa) { if (this.initialDependentBranch && isa != IUarchTest.ISA.aarch64) return false; if (isa == IUarchTest.ISA.amd64) return true; if (isa == IUarchTest.ISA.aarch64) return true; // if (isa == IUarchTest.ISA.mips64) return true; // if (isa == IUarchTest.ISA.riscv) return true; return false; } public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa) { if (isa == IUarchTest.ISA.amd64) GenerateX86GccAsm(sb); else if (isa == IUarchTest.ISA.aarch64) GenerateArmAsm(sb); } public void GenerateX86GccAsm(StringBuilder sb) { for (int i = 0; i < Counts.Length; i++) { string funcName = Prefix + Counts[i]; sb.AppendLine("\n" + funcName + ":"); sb.AppendLine(" push %rsi"); sb.AppendLine(" push %rdi"); sb.AppendLine(" push %r15"); sb.AppendLine(" push %r14"); sb.AppendLine(" push %r13"); sb.AppendLine(" push %r12"); sb.AppendLine(" push %r11"); sb.AppendLine(" push %r8"); sb.AppendLine(" push %rcx"); sb.AppendLine(" push %rdx"); // arguments are in RDI, RSI, RDX, RCX, R8, and R9 // move them into familiar windows argument regs (rcx, rdx, r8) sb.AppendLine(" mov %rdx, %r8"); // r8 <- rdx sb.AppendLine(" mov %rsi, %rdx"); // rdx <- rsi sb.AppendLine(" mov %rdi, %rcx"); // rcx <- rdi sb.AppendLine(" xor %r15, %r15"); sb.AppendLine(" mov $0x1, %r14"); sb.AppendLine(" mov $0x2, %r13"); sb.AppendLine(" mov $0x3, %r12"); sb.AppendLine(" mov $0x4, %r11"); sb.AppendLine(" xor %rdi, %rdi"); sb.AppendLine(" mov $0x40, %esi"); sb.AppendLine(" mov (%rdx,%rdi,4), %edi"); sb.AppendLine(" mov (%rdx,%rsi,4), %esi"); sb.AppendLine("\n" + funcName + "start:"); sb.AppendLine(" mov (%rdx,%rdi,4), %edi"); for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++) { string 
jumpLabel = $"{funcName}_edi_target{fillerIdx}"; sb.AppendLine($" jmp {jumpLabel}"); sb.AppendLine(".align 16"); if (fillerIdx % 2 == 0) sb.AppendLine(" nop"); sb.AppendLine($"{jumpLabel}:"); } sb.AppendLine(" mov (%rdx,%rsi,4), %esi"); for (int fillerIdx = 0; fillerIdx < Counts[i]; fillerIdx++) { string jumpLabel = $"{funcName}_esi_target{fillerIdx}"; sb.AppendLine($" jmp {jumpLabel}"); // try to space the jumps out a bit sb.AppendLine(".align 16"); if (fillerIdx % 2 == 0) sb.AppendLine(" nop"); sb.AppendLine($"{jumpLabel}:"); } sb.AppendLine(" dec %rcx"); sb.AppendLine(" jne " + funcName + "start"); sb.AppendLine(" pop %rdx"); sb.AppendLine(" pop %rcx"); sb.AppendLine(" pop %r8"); sb.AppendLine(" pop %r11"); sb.AppendLine(" pop %r12"); sb.AppendLine(" pop %r13"); sb.AppendLine(" pop %r14"); sb.AppendLine(" pop %r15"); sb.AppendLine(" pop %rdi"); sb.AppendLine(" pop %rsi"); sb.AppendLine(" ret\n\n"); } } public void GenerateArmAsm(StringBuilder sb) { string dependentBranch = this.initialDependentBranch ? 
namespace AsmGen
{
    /// <summary>
    /// Generates the "takenjumpsched" family of test functions. Each generated function
    /// runs two pointer-chasing loads per loop iteration and places Counts[i]
    /// compare + conditional-branch pairs between them; every branch skips over a small
    /// block (an increment plus padding) to a label that follows it.
    /// </summary>
    public class TakenJumpSchedTest : UarchTest
    {
        public TakenJumpSchedTest(int low, int high, int step)
        {
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "takenjumpsched";
            Description = "Scheduler, Taken Jumps";
            FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            GetFunctionCallParameters = "structIterations, A";
            DivideTimeByCount = false;
        }

        /// <summary>
        /// Reports which ISAs this test is offered for: amd64 and aarch64 only.
        /// mips64 and riscv were commented out in the original check and stay disabled.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64 || isa == IUarchTest.ISA.aarch64;
        }

        /// <summary>
        /// Appends one assembly function per entry in Counts for the requested ISA.
        /// Nothing is emitted for an ISA without a generator below.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                foreach (int branchCount in Counts)
                {
                    EmitAmd64Function(sb, Prefix + branchCount, branchCount);
                }

                return;
            }

            if (isa == IUarchTest.ISA.aarch64)
            {
                foreach (int branchCount in Counts)
                {
                    EmitAarch64Function(sb, Prefix + branchCount, branchCount);
                }

                return;
            }

            if (isa == IUarchTest.ISA.riscv)
            {
                EmitRiscvFunctions(sb);
            }
        }

        /// <summary>Appends each string as its own line, in order.</summary>
        private static void AppendLines(StringBuilder sb, params string[] lines)
        {
            foreach (string line in lines)
            {
                sb.AppendLine(line);
            }
        }

        /// <summary>Emits one x86-64 (AT&amp;T syntax) test function with the given number of branches.</summary>
        private static void EmitAmd64Function(StringBuilder sb, string funcName, int branchCount)
        {
            AppendLines(
                sb,
                "\n" + funcName + ":",
                " push %rsi",
                " push %rdi",
                " push %r8",
                " push %rcx",
                " push %rdx",
                // arguments are in RDI, RSI, RDX, RCX, R8, and R9
                // move them into familiar windows argument regs (rcx, rdx, r8)
                " mov %rdx, %r8",   // r8 <- rdx
                " mov %rsi, %rdx",  // rdx <- rsi
                " mov %rdi, %rcx",  // rcx <- rdi
                " xor %rdi, %rdi",
                " mov $0x40, %esi",
                " mov (%rdx,%rdi,4), %edi",
                " mov (%rdx,%rsi,4), %esi",
                "\n" + funcName + "start:",
                " mov (%rdx,%rdi,4), %edi");

            for (int branchIdx = 0; branchIdx < branchCount; branchIdx++)
            {
                string target = funcName + "part" + branchIdx;
                AppendLines(
                    sb,
                    " cmp %rdi, %rsi",
                    " jne " + target,
                    " inc %rax",
                    ".align 16",
                    target + ":");
            }

            AppendLines(
                sb,
                " mov (%rdx,%rsi,4), %esi",
                "lfence",
                " dec %rcx",
                " jne " + funcName + "start",
                " pop %rdx",
                " pop %rcx",
                " pop %r8",
                " pop %rdi",
                " pop %rsi",
                " ret\n\n");
        }

        /// <summary>Emits one aarch64 test function with the given number of branches.</summary>
        private static void EmitAarch64Function(StringBuilder sb, string funcName, int branchCount)
        {
            // args in x0, x1
            AppendLines(
                sb,
                "\n" + funcName + ":",
                " sub sp, sp, #0x50",
                " stp x14, x15, [sp, #0x10]",
                " stp x12, x13, [sp, #0x20]",
                " stp x10, x11, [sp, #0x30]",
                " stp x25, x26, [sp, #0x40]",
                " mov x15, 1",
                " mov w25, 0x0",
                " mov w26, 0x40",
                "\n" + funcName + "start:",
                " ldr w25, [x1, w25, uxtw #2]"); // current = A[current]

            for (int branchIdx = 0; branchIdx < branchCount; branchIdx++)
            {
                string target = funcName + "part" + branchIdx;
                AppendLines(
                    sb,
                    " cmp w25, w26",
                    " b.ne " + target,
                    " add x15, x15, 1",
                    " nop\n nop\n nop",
                    target + ":");
            }

            AppendLines(
                sb,
                " ldr w26, [x1, w26, uxtw #2]",
                " dsb sy",
                " isb sy",
                " sub x0, x0, 1",
                " cbnz x0, " + funcName + "start",
                " ldp x25, x26, [sp, #0x40]",
                " ldp x10, x11, [sp, #0x30]",
                " ldp x12, x13, [sp, #0x20]",
                " ldp x14, x15, [sp, #0x10]",
                " add sp, sp, #0x50",
                " ret\n\n");
        }

        /// <summary>
        /// RISC-V placeholder (marked "todo" by the original author): emits multiply filler
        /// through the shared structure-test helper rather than branches.
        /// </summary>
        private void EmitRiscvFunctions(StringBuilder sb)
        {
            // todo
            string[] firstFiller =
            {
                " mul x30, x30, x5",
                " mul x29, x29, x5",
                " mul x28, x28, x5",
                " mul x31, x31, x5",
            };

            string[] secondFiller =
            {
                " mul x30, x30, x6",
                " mul x31, x31, x6",
                " mul x28, x28, x6",
                " mul x29, x29, x6",
            };

            UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, Counts, Prefix, firstFiller, secondFiller, false);
        }
    }
}
namespace AsmGen
{
    /// <summary>
    /// Register file test using 512-bit packed single-precision adds (AVX-512, amd64 only).
    /// The filler instructions are handed to the shared x86 structure-test generator.
    /// </summary>
    public class Vec512RfTest : UarchTest
    {
        /// <summary>Number of zmm registers addressed by the generated code (zmm0..zmm31).</summary>
        private const int ZmmRegisterCount = 32;

        public Vec512RfTest(int low, int high, int step)
        {
            this.Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            this.Prefix = "vec512rf";
            this.Description = "Vector (512-bit packed fp) RF Test - AVX-512 only";
            this.FunctionDefinitionParameters = "uint64_t iterations, int *arr, float *floatArr";
            this.GetFunctionCallParameters = "structIterations, A, fpArr";
            this.DivideTimeByCount = false;
        }

        /// <summary>Only amd64 is supported.</summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            return isa == IUarchTest.ISA.amd64;
        }

        /// <summary>
        /// Emits the test functions for amd64; does nothing for any other ISA.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa != IUarchTest.ISA.amd64)
            {
                return;
            }

            // Loads 5 x 64 bytes starting at (%r8). Per the original author this is fine
            // because the backing array should be far bigger than this.
            StringBuilder init = new StringBuilder();
            init.Append(" vmovups (%r8), %zmm1\n");
            init.Append(" vmovups 64(%r8), %zmm2\n");
            init.Append(" vmovups 128(%r8), %zmm3\n");
            init.Append(" vmovups 192(%r8), %zmm4\n");
            init.Append(" vmovups 256(%r8), %zmm5\n");

            // use all zmm regs: seed zmm6..zmm31 from zmm5
            for (int reg = 6; reg < ZmmRegisterCount; reg++)
            {
                init.Append("vmovups %zmm5, %zmm").Append(reg).Append("\n");
            }

            // One add per register zmm1..zmm31. A plain array is used because the element
            // count is fixed; the previous untyped "List" declaration did not compile.
            string[] unrolledAdds = new string[ZmmRegisterCount - 1];
            for (int reg = 1; reg < ZmmRegisterCount; reg++)
            {
                unrolledAdds[reg - 1] = $" vaddps %zmm1, %zmm{reg}, %zmm{reg}";
            }

            UarchTestHelpers.GenerateX86AsmStructureTestFuncs(
                sb, this.Counts, this.Prefix, unrolledAdds, unrolledAdds, false, init.ToString());
        }
    }
}
namespace AsmGen
{
    /// <summary>
    /// Reorder buffer test whose filler instructions are register-zeroing operations.
    /// An optional variant ("db" suffix) is described as being preceded by a dependent branch
    /// and is only offered on aarch64.
    /// </summary>
    public class ZeroRobTest : UarchTest
    {
        private bool initialDependentBranch;

        public ZeroRobTest(int low, int high, int step, bool initialDependentBranch)
        {
            string prefixSuffix = initialDependentBranch ? "db" : string.Empty;
            string descriptionSuffix = initialDependentBranch ? ", preceded by dependent branch" : string.Empty;

            this.initialDependentBranch = initialDependentBranch;
            Counts = UarchTestHelpers.GenerateCountArray(low, high, step);
            Prefix = "zerorob" + prefixSuffix;
            Description = "Reorder Buffer Test with Zeroing Idioms" + descriptionSuffix;
            FunctionDefinitionParameters = "uint64_t iterations, int *arr";
            GetFunctionCallParameters = "structIterations, A";
            DivideTimeByCount = false;
        }

        /// <summary>
        /// amd64, aarch64 and mips64 are supported; the dependent-branch variant is aarch64 only.
        /// </summary>
        public override bool SupportsIsa(IUarchTest.ISA isa)
        {
            if (initialDependentBranch && isa != IUarchTest.ISA.aarch64)
            {
                return false;
            }

            return isa == IUarchTest.ISA.amd64
                || isa == IUarchTest.ISA.aarch64
                || isa == IUarchTest.ISA.mips64;
        }

        /// <summary>
        /// Emits the test functions for the requested ISA through the matching
        /// structure-test helper, with pointer-chasing loads enabled.
        /// </summary>
        public override void GenerateAsm(StringBuilder sb, IUarchTest.ISA isa)
        {
            if (isa == IUarchTest.ISA.amd64)
            {
                string[] filler = { " xor %r11, %r11" };
                UarchTestHelpers.GenerateX86AsmStructureTestFuncs(sb, Counts, Prefix, filler, filler, true);
                return;
            }

            if (isa == IUarchTest.ISA.aarch64)
            {
                // NOTE(review): this value is computed but never handed to the generator call
                // below, so the "db" variant appears to emit only the branch target and not the
                // dependent branch itself. Presumably it should be passed as the helper's
                // post-load instructions - confirm against GenerateArmAsmStructureTestFuncs.
                string postLoadInstrs = null;
                if (initialDependentBranch)
                {
                    postLoadInstrs = UarchTestHelpers.GetArmDependentBranch(Prefix);
                }

                string[] filler = { " mov x10, 0" };
                UarchTestHelpers.GenerateArmAsmStructureTestFuncs(sb, Counts, Prefix, filler, filler, true);

                if (initialDependentBranch)
                {
                    sb.AppendLine(UarchTestHelpers.GetArmDependentBranchTarget(Prefix));
                }

                return;
            }

            if (isa == IUarchTest.ISA.mips64)
            {
                string[] filler = { " move $r14, $r0" };
                UarchTestHelpers.GenerateMipsAsmStructureTestFuncs(sb, Counts, Prefix, filler, filler, includePtrChasingLoads: true);
                return;
            }

            if (isa == IUarchTest.ISA.riscv)
            {
                // Not reachable through SupportsIsa, which does not list riscv.
                string[] filler = { " mov $r14, $r0" };
                UarchTestHelpers.GenerateRiscvAsmStructureTestFuncs(sb, Counts, Prefix, filler, filler, includePtrChasingLoads: true);
            }
        }
    }
}
(_strnicmp(arg, "iterations", 10) == 0) { argIdx++; iter = atoi(argv[argIdx]); fprintf(stderr, "%lu iterations requested\n", iter); } else if (_strnicmp(arg, "bounce", 6) == 0) { fprintf(stderr, "Bouncy\n"); } else if (_strnicmp(arg, "owned", 5) == 0) { test = RunOwnedTest; fprintf(stderr, "Using separate cache lines for each thread to write to\n"); } else if (_strnicmp(arg, "offset", 6) == 0) { argIdx++; offsets = atoi(argv[argIdx]); fprintf(stderr, "Offsets: %d\n", offsets); } } } bouncyBase = (LONG64*)_aligned_malloc(64 * offsets, 4096); bouncy = bouncyBase; if (bouncy == NULL) { fprintf(stderr, "Could not allocate aligned mem\n"); } GetSystemInfo(&sysInfo); numProcs = sysInfo.dwNumberOfProcessors; fprintf(stderr, "Number of CPUs: %u\n", numProcs); latencies = (float **)malloc(sizeof(float*) * offsets); if (latencies == NULL) { fprintf(stderr, "couldn't allocate result array\n"); return 0; } for (DWORD offsetIdx = 0; offsetIdx < offsets; offsetIdx++) { bouncy = (LONG64*)((char*)bouncyBase + offsetIdx * 64); latencies[offsetIdx] = (float*)malloc(sizeof(float) * numProcs * numProcs); float* latenciesPtr = latencies[offsetIdx]; // Run all to all, skipping testing a core against itself ofc // technically can skip the other way around (start j = i + 1) but meh for (DWORD i = 0; i < numProcs; i++) { for (DWORD j = 0; j < numProcs; j++) { latenciesPtr[j + i * numProcs] = i == j ? 
0 : test(i, j, iter); } } } for (DWORD offsetIdx = 0; offsetIdx < offsets; offsetIdx++) { printf("Cache line offset: %d\n", offsetIdx); float* latenciesPtr = latencies[offsetIdx]; // print thing to copy to excel for (DWORD i = 0; i < numProcs; i++) { for (DWORD j = 0; j < numProcs; j++) { if (j != 0) printf(","); if (j == i) printf("x"); else printf("%f", latenciesPtr[j + i * numProcs]); } printf("\n"); } free(latenciesPtr); } free(latencies); _aligned_free(bouncyBase); return 0; } float TimeThreads(unsigned int processor1, unsigned int processor2, uint64_t iter, LatencyData lat1, LatencyData lat2, DWORD (*threadFunc)(LPVOID)) { struct timeb start, end; HANDLE testThreads[2]; DWORD tid1, tid2; testThreads[0] = CreateThread(NULL, 0, threadFunc, &lat1, CREATE_SUSPENDED, &tid1); testThreads[1] = CreateThread(NULL, 0, threadFunc, &lat2, CREATE_SUSPENDED, &tid2); if (testThreads[0] == NULL || testThreads[1] == NULL) { fprintf(stderr, "Failed to create test threads\n"); return -1; } SetThreadAffinityMask(testThreads[0], 1ULL << (uint64_t)processor1); SetThreadAffinityMask(testThreads[1], 1ULL << (uint64_t)processor2); ftime(&start); ResumeThread(testThreads[0]); ResumeThread(testThreads[1]); WaitForMultipleObjects(2, testThreads, TRUE, INFINITE); ftime(&end); int64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm); float latency = 1e6 * (float)time_diff_ms / (float)iter; fprintf(stderr, "%d to %d: %f ns\n", processor1, processor2, latency); CloseHandle(testThreads[0]); CloseHandle(testThreads[1]); // each thread does interlocked compare and exchange iterations times. 
divide by 2 to get overall count of locked ops return latency / 2; } /// /// Measures latency from one processor core to another /// /// processor number 1 /// processor number 2 /// Number of iterations /// aligned mem to bounce around /// latency per iteration in ns float RunTest(unsigned int processor1, unsigned int processor2, uint64_t iter) { LatencyData lat1, lat2; float latency; *bouncy = 0; lat1.iterations = iter; lat1.start = 1; lat1.target = bouncy; lat2.iterations = iter; lat2.start = 2; lat2.target = bouncy; latency = TimeThreads(processor1, processor2, iter, lat1, lat2, LatencyTestThread); return latency; } float RunOwnedTest(unsigned int processor1, unsigned int processor2, uint64_t iter) { LatencyData lat1, lat2; LONG64* target1, * target2; float latency; // drop them on different cache lines target1 = (LONG64*)_aligned_malloc(128, 64); target2 = target1 + 8; if (target1 == NULL) { fprintf(stderr, "Could not allocate aligned mem\n"); } *target1 = 1; *target2 = 0; lat1.iterations = iter; lat1.start = 3; lat1.target = target1; lat1.readTarget = target2; lat2.iterations = iter; lat2.start = 2; lat2.target = target2; lat2.readTarget = target1; latency = TimeThreads(processor1, processor2, iter, lat1, lat2, ReadLatencyTestThread); _aligned_free(target1); return latency; } /// /// Runs one thread of the latency test. 
should be run in pairs /// Always writes to target /// /// Latency test params /// next value that would have been written to shared memory DWORD WINAPI LatencyTestThread(LPVOID param) { LatencyData *latencyData = (LatencyData *)param; uint64_t current = latencyData->start; while (current <= 2 * latencyData->iterations) { if (_InterlockedCompareExchange64(latencyData->target, current, current - 1) == current - 1) { current += 2; } } return current; } /// /// Similar thing but tries to not bounce cache line ownership /// Instead, threads write to different cache lines /// /// Latency test params /// next value that would have been written to owned mem DWORD WINAPI ReadLatencyTestThread(LPVOID param) { LatencyData* latencyData = (LatencyData*)param; uint64_t current = latencyData->start; uint64_t startTsc = __rdtsc(); while (current <= 2 * latencyData->iterations) { if (*(latencyData->readTarget) == current - 1) { *(latencyData->target) = current; current += 2; _mm_sfence(); } } return current; } ================================================ FILE: CoherencyLatency/CoherencyLatency.sln ================================================  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 16 VisualStudioVersion = 16.0.31025.194 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CoherencyLatency", "CoherencyLatency.vcxproj", "{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 Debug|x86 = Debug|x86 Release|x64 = Release|x64 Release|x86 = Release|x86 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x64.ActiveCfg = Debug|x64 {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x64.Build.0 = Debug|x64 {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x86.ActiveCfg = Debug|Win32 {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Debug|x86.Build.0 = Debug|Win32 
{6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x64.ActiveCfg = Release|x64 {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x64.Build.0 = Release|x64 {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x86.ActiveCfg = Release|Win32 {6D9CCC8C-09F5-484B-8630-BE18A9CF1995}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {A6E60C3D-60ED-4DBF-B4AA-7C1C3A140325} EndGlobalSection EndGlobal ================================================ FILE: CoherencyLatency/CoherencyLatency.vcxproj ================================================ Debug Win32 Release Win32 Debug x64 Release x64 16.0 Win32Proj {6d9ccc8c-09f5-484b-8630-be18a9cf1995} CoherencyLatency 10.0 Application true v142 Unicode Application false v142 true Unicode Application true v142 Unicode Application false v142 true Unicode true false true false Level3 true WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true Level3 true true true WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true true true Level3 true _DEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true Level3 true true true NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true true true ================================================ FILE: CoherencyLatency/Makefile ================================================ include ../Common/arch_detect.mk CFLAGS = -pthread -O3 all: $(TARGET) amd64: $(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_amd64 $(LDFLAGS) aarch64: $(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_aarch64 $(LDFLAGS) riscv64: $(CC) $(CFLAGS) PThreadsCoherencyLatency.c -o CoherencyLatency_riscv64 $(LDFLAGS) w64: $(CC) $(CFLAGS) CoherencyLatency.cpp -o CoherencyLatency_w64.exe $(LDFLAGS) # w64 can build with mingw 11, which isn't available on jammy ci: amd64 aarch64 riscv64 clean: rm -rf *.o *.zip "ocl-icd-libopencl1*" 
"OpenCL-SDK*" && find . -type f -executable -delete .PHONY: all ci clean ================================================ FILE: CoherencyLatency/PThreadsCoherencyLatency.c ================================================ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #define ITERATIONS 10000000; // kidding right? #define gettid() syscall(SYS_gettid) typedef struct LatencyThreadData { uint64_t start; uint64_t iterations; volatile uint64_t *target; unsigned int processorIndex; } LatencyData; typedef struct LatencyPairRunData { uint32_t processor1; uint32_t processor2; uint64_t iter; float result; uint64_t *target; } LatencyPairRunData; void *LatencyTestThread(void *param); void *NoLockLatencyTestThread(void *param); void *(*testFunc)(void *) = LatencyTestThread; void *RunTest(void *param); int main(int argc, char *argv[]) { float **latencies; int *parallelTestState; int numProcs, offsets = 1, parallelismFactor = 1; uint64_t iter = ITERATIONS; uint64_t *bouncyArr; numProcs = get_nprocs(); fprintf(stderr, "Number of CPUs: %u\n", numProcs); for (int argIdx = 1; argIdx < argc; argIdx++) { if (*(argv[argIdx]) == '-') { char* arg = argv[argIdx] + 1; if (strncmp(arg, "iterations", 10) == 0) { argIdx++; iter = atoi(argv[argIdx]); fprintf(stderr, "%lu iterations requested\n", iter); } else if (strncmp(arg, "nolock", 6) == 0) { fprintf(stderr, "No locks, plain loads and stores\n"); testFunc = NoLockLatencyTestThread; } else if (strncmp(arg, "offset", 6) == 0) { argIdx++; offsets = atoi(argv[argIdx]); fprintf(stderr, "Offsets: %d\n", offsets); } else if (strncmp(arg, "parallel", 8) == 0) { argIdx++; parallelismFactor = atoi(argv[argIdx]); fprintf(stderr, "Will go for %d runs in parallel\n", parallelismFactor); } } } latencies = (float **)malloc(sizeof(float *) * offsets); parallelTestState = (int *)malloc(sizeof(int) * numProcs * numProcs); memset(latencies, 0, sizeof(float) * offsets); if (0 != 
posix_memalign((void **)(&bouncyArr), 4096, 4096 * parallelismFactor)) { fprintf(stderr, "Could not allocate aligned mem\n"); return 0; } LatencyPairRunData *pairRunData = (LatencyPairRunData *)malloc(sizeof(LatencyPairRunData) * parallelismFactor); for (int offsetIdx = 0; offsetIdx < offsets; offsetIdx++) { latencies[offsetIdx] = (float *)malloc(sizeof(float) * numProcs * numProcs); memset(parallelTestState, 0, sizeof(int) * numProcs * numProcs); float *latenciesPtr = latencies[offsetIdx]; while (1) { // select parallelismFactor threads int selectedParallelTestCount = 0; memset(pairRunData, 0, sizeof(LatencyPairRunData) * parallelismFactor); for (int i = 0;i < numProcs && selectedParallelTestCount < parallelismFactor; i++) { for (int j = 0;j < numProcs && selectedParallelTestCount < parallelismFactor; j++) { if (j == i) { latenciesPtr[j + i * numProcs] = 0; continue; } if (parallelTestState[j + i * numProcs] == 1) { fprintf(stderr, "Thread unexpectedly did not complete\n"); exit(0); } if (parallelTestState[j + i * numProcs] == 0) { // neither thread can already have a pending run int validPair = 1; for (int c = 0; c < numProcs; c++) { if (parallelTestState[j + c * numProcs] == 1 || parallelTestState[c + i * numProcs] == 1 || parallelTestState[i + c * numProcs] == 1 || parallelTestState[c + j * numProcs] == 1) { validPair = 0; break; } } if (!validPair) continue; // for SMT enabled CPUs, check sibling threads. 
will do later parallelTestState[j + i * numProcs] = 1; pairRunData[selectedParallelTestCount].processor1 = i; pairRunData[selectedParallelTestCount].processor2 = j; pairRunData[selectedParallelTestCount].iter = iter; pairRunData[selectedParallelTestCount].result = 0.0f; pairRunData[selectedParallelTestCount].target = bouncyArr + (512 * selectedParallelTestCount + 8 * offsetIdx); fprintf(stderr, "Selected %d -> %d\n", i, j); selectedParallelTestCount++; } } } if (selectedParallelTestCount == 0) break; // launch threads fprintf(stderr, "Selected %d pairs for parallel testing\n", selectedParallelTestCount); pthread_t *testThreads = (pthread_t *)malloc(selectedParallelTestCount * sizeof(pthread_t)); memset(testThreads, 0, selectedParallelTestCount * sizeof(pthread_t)); for (int parallelIdx = 0; parallelIdx < selectedParallelTestCount; parallelIdx++) { if (pairRunData[parallelIdx].processor1 == 0 && pairRunData[parallelIdx].processor2 == 0) break; pthread_create(testThreads + parallelIdx, NULL, RunTest, (void *)(pairRunData + parallelIdx)); } // join threads for (int parallelIdx = 0; parallelIdx < selectedParallelTestCount; parallelIdx++) { pthread_join(testThreads[parallelIdx], NULL); int i = pairRunData[parallelIdx].processor1; int j = pairRunData[parallelIdx].processor2; latenciesPtr[j + i * numProcs] = pairRunData[parallelIdx].result; parallelTestState[j + i * numProcs] = 2; } free(testThreads); } } for (int offsetIdx = 0; offsetIdx < offsets; offsetIdx++) { float *latenciesPtr = latencies[offsetIdx]; printf("Cache line offset: %d\n", offsetIdx); for (int i = 0;i < numProcs; i++) { for (int j = 0;j < numProcs; j++) { if (j != 0) printf(","); if (j == i) printf("x"); // to maintain consistency, divide by 2 (see justification in windows version) else printf("%f", latenciesPtr[j + i * numProcs] / 2); } printf("\n"); } free(latenciesPtr); } free(parallelTestState); free(pairRunData); free(latencies); free(bouncyArr); return 0; } // run test and gather timing data using 
the specified thread function float TimeThreads(unsigned int proc1, unsigned int proc2, uint64_t iter, LatencyData *lat1, LatencyData *lat2, void *(*threadFunc)(void *)) { struct timeval startTv, endTv; struct timezone startTz, endTz; pthread_t testThreads[2]; int t1rc, t2rc; void *res1, *res2; gettimeofday(&startTv, &startTz); t1rc = pthread_create(&testThreads[0], NULL, threadFunc, (void *)lat1); t2rc = pthread_create(&testThreads[1], NULL, threadFunc, (void *)lat2); if (t1rc != 0 || t2rc != 0) { fprintf(stderr, "Could not create threads\n"); return 0; } pthread_join(testThreads[0], &res1); pthread_join(testThreads[1], &res2); gettimeofday(&endTv, &endTz); uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); float latency = 1e6 * (float)time_diff_ms / (float)iter; return latency; } // test latency between two logical CPUs // float RunTest(unsigned int processor1, unsigned int processor2, uint64_t iter) { void *RunTest(void *param) { LatencyPairRunData *pairRunData = (LatencyPairRunData *)param; uint32_t processor1 = pairRunData->processor1; uint32_t processor2 = pairRunData->processor2; uint64_t iter = pairRunData->iter; LatencyData lat1, lat2; float latency; *(pairRunData->target) = 0; lat1.iterations = iter; lat1.start = 1; lat1.target = pairRunData->target; lat1.processorIndex = processor1; lat2.iterations = iter; lat2.start = 2; lat2.target = pairRunData->target; lat2.processorIndex = processor2; latency = TimeThreads(processor1, processor2, iter, &lat1, &lat2, NoLockLatencyTestThread); fprintf(stderr, "%d to %d: %f ns\n", processor1, processor2, latency); pairRunData->result = latency; return NULL; } void *LatencyTestThread(void *param) { LatencyData *latencyData = (LatencyData *)param; cpu_set_t cpuset; uint64_t current = latencyData->start; CPU_ZERO(&cpuset); CPU_SET(latencyData->processorIndex, &cpuset); sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset); //fprintf(stderr, "thread %ld set 
/// <summary>
/// Reorders a square core-to-core latency matrix (comma separated, one row per line)
/// from one logical CPU numbering to another and writes the result to stdout.
/// </summary>
public class C2CParse
{
    /// <summary>SMT threads per core assumed when no second argument is given.</summary>
    private const int DefaultSmtCount = 4;

    /// <summary>Physical core count assumed when no third argument is given.</summary>
    private const int DefaultCoreCount = 64;

    /// <summary>
    /// Usage: c2cparse &lt;file&gt; [smtCount] [coreCount]. The matrix dimension must equal
    /// smtCount * coreCount (defaults: 4 and 64).
    /// </summary>
    public static void Main(string[] args)
    {
        if (args.Length == 0)
        {
            Console.WriteLine("Need filename as arg");
            return;
        }

        int smtCount = DefaultSmtCount;
        int coreCount = DefaultCoreCount;
        if (args.Length > 1 && !TryParsePositive(args[1], out smtCount))
        {
            Console.WriteLine("SMT count must be a positive integer");
            return;
        }

        if (args.Length > 2 && !TryParsePositive(args[2], out coreCount))
        {
            Console.WriteLine("Core count must be a positive integer");
            return;
        }

        // ReadAllLines copes with both LF and CRLF. Blank lines (typically the one produced
        // by a trailing newline) are dropped so they do not count as matrix rows.
        string[] inputLines = Array.FindAll(
            System.IO.File.ReadAllLines(args[0]),
            line => !string.IsNullOrWhiteSpace(line));

        int size = inputLines.Length;
        if (size == 0)
        {
            Console.WriteLine("Input file contains no data");
            return;
        }

        // Every row must have exactly as many cells as there are rows.
        string[][] cells = new string[size][];
        for (int row = 0; row < size; row++)
        {
            cells[row] = inputLines[row].Split(',');
            if (cells[row].Length != size)
            {
                Console.WriteLine("Line count: {0}, line segments: {1} must be equal", size, cells[row].Length);
                return;
            }
        }

        // GetCoreIndex only yields indices inside the matrix when the topology matches it.
        if ((long)smtCount * coreCount != size)
        {
            Console.WriteLine(
                "Matrix size {0} does not match smtCount * coreCount = {1} * {2}",
                size,
                smtCount,
                coreCount);
            return;
        }

        string[] outputLatencies = new string[size * size];
        for (int row = 0; row < size; row++)
        {
            // translate both row and col
            int newRow = GetCoreIndex(row, smtCount, coreCount);
            for (int col = 0; col < size; col++)
            {
                int newCol = GetCoreIndex(col, smtCount, coreCount);
                outputLatencies[newRow * size + newCol] = cells[row][col];
            }
        }

        for (int row = 0; row < size; row++)
        {
            for (int col = 0; col < size; col++)
            {
                Console.Write(",{0}", outputLatencies[row * size + col]);
            }

            Console.WriteLine();
        }
    }

    /// <summary>
    /// Convert linux index to windows index: input indices enumerate all cores for SMT
    /// thread 0, then all cores for SMT thread 1, and so on; output indices keep the SMT
    /// threads of one core adjacent.
    /// </summary>
    /// <param name="inputIndex">Index in the input ordering.</param>
    /// <param name="smtCount">SMT threads per physical core.</param>
    /// <param name="coreCount">Number of physical cores.</param>
    /// <returns>physicalCoreIndex * smtCount + smtIndex</returns>
    public static int GetCoreIndex(int inputIndex, int smtCount, int coreCount)
    {
        int physicalCoreIndex = inputIndex % coreCount;
        int smtIndex = inputIndex / coreCount;
        return physicalCoreIndex * smtCount + smtIndex;
    }

    /// <summary>Parses a strictly positive decimal integer, independent of the current culture.</summary>
    private static bool TryParsePositive(string text, out int value)
    {
        return int.TryParse(
                   text,
                   System.Globalization.NumberStyles.None,
                   System.Globalization.CultureInfo.InvariantCulture,
                   out value)
               && value > 0;
    }
}
Common/arch_detect.mk ================================================ TARGET ?= amd64 ifeq ($(OS),Windows_NT) TARGET = w64 else UNAME_M := $(shell uname -m) ifeq ($(UNAME_M),x86_64) TARGET = amd64 endif ifeq ($(UNAME_M),aarch64) TARGET = aarch64 endif ifeq ($(UNAME_M),riscv64) TARGET = riscv64 endif UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) TARGET = darwin endif endif amd64: CC = x86_64-linux-gnu-gcc amd64_numa: CC = x86_64-linux-gnu-gcc aarch64: CC := gcc aarch64_numa: CC = aarch64-linux-gnu-gcc riscv64: CC = riscv64-linux-gnu-gcc w64: CC = x86_64-w64-mingw32-gcc darwin: CC = clang ================================================ FILE: Common/ci_gpumemlatency.sh ================================================ #!/bin/sh make_all () { make amd64 make clean-obj LDFLAGS="-lm -L ocl-icd-arm64/usr/lib/aarch64-linux-gnu -lOpenCL" make aarch64 make clean-obj LDFLAGS="-lm -L ocl-icd-riscv64/usr/lib/riscv64-linux-gnu -lOpenCL" make riscv64 make clean-obj CPPFLAGS="-I OpenCL-SDK-${OCL_VER}-Win-x64/include" LDFLAGS="-lm -L OpenCL-SDK-${OCL_VER}-Win-x64/lib -lOpenCL" make w64 make clean-obj } linux_deps () { for ARCH in arm64 riscv64; do if ! grep -q $ARCH /etc/apt/sources.list; then echo "deb [arch=${ARCH}] http://ports.ubuntu.com/ubuntu-ports $(lsb_release -c -s) universe" | sudo tee -a /etc/apt/sources.list echo "deb-src [arch=${ARCH}] http://ports.ubuntu.com/ubuntu-ports $(lsb_release -c -s) universe" | sudo tee -a /etc/apt/sources.list sudo apt update fi apt-get download "ocl-icd-libopencl1:${ARCH}" find . 
# Download and unpack the Khronos OpenCL SDK (Windows x64) release named by OCL_VER
# into the current directory. Returns non-zero if the download fails.
w64_deps () {
    if [ -z "${OCL_VER}" ]; then
        echo "OCL_VER must be set to an OpenCL-SDK release tag" >&2
        return 1
    fi

    # -f: fail on HTTP errors, -s: no progress meter, -S: still print errors,
    # -L: follow redirects, -O: keep the remote file name
    curl -fsSLO "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/${OCL_VER}/OpenCL-SDK-${OCL_VER}-Win-x64.zip" || return 1
    unzip "OpenCL-SDK-${OCL_VER}-Win-x64.zip"
}
/*
 * Reads one 64-bit model specific register through /dev/cpu/<coreindex>/msr.
 *
 * coreindex: logical CPU whose msr device is opened
 * msrindex:  register number, used as the file offset
 *
 * Returns the register value, or 0 if the device could not be opened or read
 * (a message is written to stderr in that case).
 */
uint64_t readmsr(uint32_t coreindex, uint32_t msrindex) {
    char path[256];
    uint64_t msrvalue = 0;
    int fd;

    snprintf(path, sizeof(path), "/dev/cpu/%u/msr", coreindex);

    // Only reading is done here, so do not ask for write access.
    fd = open(path, O_RDONLY);
    if (fd == -1) {
        fprintf(stderr, "Could not open msr\n");
        return 0;
    }

    if (lseek(fd, (off_t)msrindex, SEEK_SET) == (off_t)-1 ||
        read(fd, &msrvalue, sizeof(msrvalue)) != (ssize_t)sizeof(msrvalue)) {
        fprintf(stderr, "Could not read msr 0x%x\n", msrindex);
        close(fd);
        return 0;
    }

    close(fd);
    return msrvalue;
}
Get from /sys/devices//type // on Arrow Lake, atom = 10, core = 4 attr->config = cfg | ((uint64_t)hwid << 32); attr->type = PERF_TYPE_HARDWARE; attr->size = sizeof(struct perf_event_attr); attr->disabled = 1; attr->exclude_kernel = 1; attr->exclude_hv = 1; attr->inherit = 1; // include child threads attr->read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; } void set_hw_event(struct perf_select_data *evt, int groupfd) { evt->fd = syscall(__NR_perf_event_open, &(evt->attr), 0, -1, groupfd, 0); ioctl(evt->fd, PERF_EVENT_IOC_ID, &(evt->id)); } void open_perf_monitoring() { int groupLeaderFd = -1; memset(perf_selected_events, 0, sizeof(struct perf_select_data) * PERF_NUM_EVENTS); perf_selected_events[0].description = "instructions"; initialize_hw_event(&(perf_selected_events[0].attr), PERF_COUNT_HW_INSTRUCTIONS, 0); set_hw_event(perf_selected_events, -1); groupLeaderFd = perf_selected_events[0].fd; perf_selected_events[1].description = "cycles"; initialize_hw_event(&(perf_selected_events[1].attr), PERF_COUNT_HW_CPU_CYCLES, 0); set_hw_event(perf_selected_events + 1, groupLeaderFd); perf_selected_events[2].description = "llc_ref"; initialize_hw_event(&(perf_selected_events[2].attr), 0x4F2E, 0); perf_selected_events[2].attr.type = PERF_TYPE_RAW; set_hw_event(perf_selected_events + 2, groupLeaderFd); perf_selected_events[3].description = "llc_miss"; initialize_hw_event(&(perf_selected_events[3].attr), 0x412E, 0); perf_selected_events[3].attr.type = PERF_TYPE_RAW; set_hw_event(perf_selected_events + 3, groupLeaderFd); } void start_perf_monitoring() { gettimeofday(&perf_startTv, NULL); int groupLeaderFd = perf_selected_events[0].fd; ioctl(groupLeaderFd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); ioctl(groupLeaderFd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); } uint64_t instrs, cycles, llcRef, llcMiss; void stop_perf_monitoring() { int readbytes = 0; int groupLeaderFd = perf_selected_events[0].fd; ioctl(groupLeaderFd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP); // 
fprintf(stderr, "read %d bytes\n", sizeof(struct perf_read_data)); readbytes = read(groupLeaderFd, &perfReadData, sizeof(struct perf_read_data)); //fprintf(stderr, "Read %d bytes into perf_read_data. nr = %lu\n", readbytes, perfReadData.nr); for (int i = 0; i < perfReadData.nr; i++) { for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) { if (perf_selected_events[evt_idx].id == perfReadData.values[i].id) { struct perf_select_data *selected_evt = perf_selected_events + evt_idx; selected_evt->value = perfReadData.values[i].value; // fprintf(stderr, "%s: %lu\n", selected_evt->description, selected_evt->value); } } } gettimeofday(&perf_endTv, NULL); perf_time_ms = ((perf_endTv.tv_sec - perf_startTv.tv_sec) * 1000 + (perf_endTv.tv_usec - perf_startTv.tv_usec) / 1000); } void close_perf_monitoring() { for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) close(perf_selected_events[evt_idx].fd); } void append_perf_header() { for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) { printf(",%s", perf_selected_events[evt_idx].description); } printf(",Time (ms)"); } void append_perf_values() { for (int evt_idx = 0; evt_idx < PERF_NUM_EVENTS; evt_idx++) { printf(",%lu", perf_selected_events[evt_idx].value); } printf(",%lu", perf_time_ms); } ================================================ FILE: Common/timing.c ================================================ #ifdef _MSC_VER #include __declspec(selectany) struct timeb start, end; void start_timing() { ftime(&start); } unsigned int end_timing() { ftime(&end); return 1000 * (end.time - start.time) + (end.millitm - start.millitm); } void start_timing_ts(struct timeb *startTimeb) { ftime(startTimeb); } unsigned int end_timing_ts(struct timeb* startTimeb) { struct timeb end; ftime(&end); return 1000 * (end.time - startTimeb->time) + (end.millitm - startTimeb->millitm); } #else #include #include struct timeval startTv, endTv; void start_timing() { gettimeofday(&startTv, NULL); } unsigned int end_timing() { 
gettimeofday(&endTv, NULL); return (unsigned int)((endTv.tv_sec - startTv.tv_sec) * 1000 + (endTv.tv_usec - startTv.tv_usec) / 1000); } void start_timing_ts(struct timeval* start) { gettimeofday(start, NULL); } unsigned int end_timing_ts(struct timeval* start) { struct timeval end; gettimeofday(&end, NULL); return (unsigned int)((end.tv_sec - start->tv_sec) * 1000 + (end.tv_usec - start->tv_usec) / 1000); } #endif unsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time) { // safety measure to deal with nasty timer precision issues if the system is fast if (last_time < 50) return last_iteration_count * 2; return last_iteration_count * (target_time / last_time); } ================================================ FILE: Common/timing.h ================================================ #ifndef timingincluded #define timingincluded #ifdef _MSC_VER #include #else #include #endif extern struct timeb start, end; inline void start_timing(); inline unsigned int end_timing(); #ifdef _MSC_VER void start_timing_ts(struct timeb* startTimeb); unsigned int end_timing_ts(struct timeb* startTimeb); #else void start_timing_ts(struct timeval* start); unsigned int end_timing_ts(struct timeval* start); #endif unsigned long long scale_iterations_to_target(unsigned long long last_iteration_count, float last_time, float target_time); #endif ================================================ FILE: CoreClockChecker/BoostClockChecker.c ================================================ #include #include #include #include #include #include #include extern uint64_t clktsctest(uint64_t iterations) __attribute((ms_abi)); int main(int argc, char *argv[]) { struct timeval startTv, endTv; uint64_t iterations = 500000, samples = 100; unsigned int sleepSeconds = 5; time_t time_diff_ms; for (int argIdx = 1; argIdx < argc; argIdx++) { if (*(argv[argIdx]) == '-') { char *arg = argv[argIdx] + 1; if (strncmp(arg, "samples", 7) == 0) { argIdx++; 
samples = atol(argv[argIdx]); } else if (strncmp(arg, "iterations", 10) == 0) { argIdx++; iterations = atol(argv[argIdx]); } else if (strncmp(arg, "sleep", 5) == 0) { argIdx++; sleepSeconds = atoi(argv[argIdx]); } } } sleep(sleepSeconds); uint64_t *measuredTscs = malloc(samples * sizeof(uint64_t)); for (uint64_t sampleIdx = 0; sampleIdx < samples; sampleIdx++) { uint64_t elapsedTsc = clktsctest(iterations); measuredTscs[sampleIdx] = elapsedTsc; } fprintf(stderr, "Used %lu samples\n", samples); fprintf(stderr, "Used %lu iterations\n", iterations); // figure out TSC to real time ratio fprintf(stderr, "Checking TSC ratio...\n"); uint64_t iterationsHi = 8e9; // should be a couple seconds at least? gettimeofday(&startTv, NULL); uint64_t referenceElapsedTsc = clktsctest(iterationsHi); gettimeofday(&endTv, NULL); time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); float tsc_per_ms = (float)referenceElapsedTsc / (float)time_diff_ms; float tsc_per_ns = tsc_per_ms / 1e6; fprintf(stderr, "TSC = %lu, elapsed ms = %lu\n", referenceElapsedTsc, time_diff_ms); fprintf(stderr, "TSC per ms: %f, TSC per ns: %f\n", tsc_per_ms, tsc_per_ns); printf("Time (ms), Clk (GHz), TSC\n"); float elapsedTime = 0; for (uint64_t sampleIdx = 0; sampleIdx < samples; sampleIdx++) { // (tsc / ms) * tsc = 1 / ms float elapsedTimeMs = measuredTscs[sampleIdx] / tsc_per_ms; elapsedTime += elapsedTimeMs; float latency = 1e6 * elapsedTimeMs / (float)iterations; float addsPerNs = 1 / latency; printf("%f,%f,%lu\n", elapsedTime, addsPerNs, measuredTscs[sampleIdx]); } return 0; } ================================================ FILE: CoreClockChecker/BoostClockChecker_arm.s ================================================ .text .global clktsctest .global _clktsctest .balign 4 /* x0 = iterations, return elapsed TSC in x0 */ _clktsctest: clktsctest: sub sp, sp, #0x40 stp x10, x11, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x14, x15, [sp, #0x30] mov x10, 1 mov x11, 20 
mov x12, 0 /* stackoverflow says this is a good idea */ mrs x14, cntvct_el0 clktsctest_loop: add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 add x12, x12, x10 sub x0, x0, x11 cbnz x0, clktsctest_loop mrs x15, cntvct_el0 sub x0, x15, x14 ldp x14, x15, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x10, x11, [sp, #0x10] add sp, sp, #0x40 ret ================================================ FILE: CoreClockChecker/BoostClockChecker_x86.s ================================================ .global clktsctest /* rcx = iterations, return elapsed TSC in rax */ clktsctest: push %rdx push %rbx push %r8 push %r9 push %r10 mov %rcx, %rdi mov $1, %r8 mov $20, %r9 xor %rbx, %rbx rdtsc /* high 32 bits in EDX, low 32 bits in EAX */ shl $32, %rdx /* shift high 32 bits into upper half of EDX */ add %rax, %rdx /* place full 64-bit value in rdx */ mov %rdx, %r10 clktsctest_loop: add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx sub %r9, %rdi jnz clktsctest_loop rdtsc shl $32, %rdx add %rdx, %rax /* now rax has the new value */ sub %r10, %rax /* subtract old TSC value from the new one, which should be larger */ pop %r10 pop %r9 pop %r8 pop %rbx pop %rdx ret ================================================ FILE: CoreClockChecker/CoreClockChecker.c ================================================ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #define MSR_RAPL_PWR_UNIT 0xC0010299 #define HWCR 0xC0010015 
#define MSR_CORE_ENERGY_STAT 0xC001029A #define MSR_PKG_ENERGY_STAT 0xC001029B #define INTEL_MSR_RAPL_PWR_UNIT 0x606 #define INTEL_MSR_PP0_ENERGY_STATUS 0x639 #define INTEL_MSR_PKG_ENERGY_STATUS 0x611 extern uint64_t clktest(uint64_t iterations) __attribute((sysv_abi)); void detectCpuMaker(); void setBoost(int on); void setAffinity(int core); int openMsr(int core); uint64_t readMsr(int fd, uint32_t addr); void writeMsr(int fd, uint32_t addr, uint64_t value); float getEnergyStatusUnits(); uint64_t getCoreEnergyStat(int core); uint64_t getPkgEnergyStat(int core); uint64_t getTotalCoreEnergy(); int *msrFds; int amdCpu = 1; int numProcs = 0; int main(int argc, char *argv[]) { struct timeval startTv, endTv; time_t time_diff_ms; float latency, clockSpeedGhz, energyUnits; uint64_t startEnergy, endEnergy, startPkgEnergy, endPkgEnergy; uint64_t iterationsHigh = 8e9; detectCpuMaker(); numProcs = get_nprocs(); fprintf(stderr, "Number of CPUs: %u\n", numProcs); msrFds = (int *)malloc(sizeof(int) * numProcs); memset(msrFds, 0, sizeof(int) * numProcs); if (argc > 1 && strncmp(argv[1], "disableboost", 12) == 0) { setBoost(0); } else if (argc > 1 && strncmp(argv[1], "enableboost", 11) == 0) { setBoost(1); } else if (argc > 1 && strncmp(argv[1], "power", 5) == 0) { iterationsHigh *= 2; // try for more accuracy energyUnits = getEnergyStatusUnits(); printf("Core, Core Power, Package Power\n"); for (int i = 0; i < numProcs; i++) { setAffinity(i); gettimeofday(&startTv, NULL); startEnergy = getCoreEnergyStat(i); startPkgEnergy = getPkgEnergyStat(i); clktest(iterationsHigh); endPkgEnergy = getPkgEnergyStat(i); endEnergy = getCoreEnergyStat(i); gettimeofday(&endTv, NULL); time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); latency = 1e6 * (float)time_diff_ms / (float)iterationsHigh; clockSpeedGhz = 1 / latency; //printf("runtime: %llu ms\n", time_diff_ms); //printf("%d, %f GHz\n", i, clockSpeedGhz); printf("%d, %f, %f\n", i, ((endEnergy - 
startEnergy) * energyUnits) / (time_diff_ms / 1000), ((endPkgEnergy - startPkgEnergy) * energyUnits) / (time_diff_ms / 1000)); } } else if (argc > 2 && strncmp(argv[1], "measurecmd", 9) == 0) { int rc; float coreJoules, pkgJoules; fprintf(stderr, "argv[2] is %s\nOnly handling Intel at the moment\n", argv[2]); energyUnits = getEnergyStatusUnits(); gettimeofday(&startTv, NULL); startEnergy = getTotalCoreEnergy(); startPkgEnergy = getPkgEnergyStat(0); rc = system(argv[2]); endEnergy = getTotalCoreEnergy(); endPkgEnergy = getPkgEnergyStat(0); gettimeofday(&endTv, NULL); fprintf(stderr, "system() returned %d\n", rc); time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); coreJoules = (endEnergy - startEnergy) * energyUnits; pkgJoules = (endPkgEnergy - startPkgEnergy) * energyUnits; printf("Core Joules: %f\n", coreJoules); printf("Package Joules: %f\n", pkgJoules); printf("Elapsed time, seconds: %f\n", (double)time_diff_ms / 1000); } else { for (int i = 0; i < numProcs; i++) { setAffinity(i); gettimeofday(&startTv, NULL); clktest(iterationsHigh); gettimeofday(&endTv, NULL); time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); latency = 1e6 * (float)time_diff_ms / (float)iterationsHigh; clockSpeedGhz = 1 / latency; //printf("runtime: %llu ms\n", time_diff_ms); printf("%d, %f GHz\n", i, clockSpeedGhz); } } free(msrFds); return 0; } void detectCpuMaker() { uint32_t cpuidEax, cpuidEbx, cpuidEcx, cpuidEdx; uint32_t *uintPtr; char cpuName[13]; amdCpu = 0; __cpuid_count(0, 0, cpuidEax, cpuidEbx, cpuidEcx, cpuidEdx); uintPtr = (uint32_t *)cpuName; uintPtr[0] = cpuidEbx; uintPtr[1] = cpuidEdx; uintPtr[2] = cpuidEcx; cpuName[12] = 0; fprintf(stderr, "CPU name: %s\n", cpuName); if (memcmp(cpuName, "GenuineIntel", 12) == 0) { amdCpu = 0; fprintf(stderr, "Looks like Intel\n"); } else if (memcmp(cpuName, "AuthenticAMD", 12) == 0) { amdCpu = 1; fprintf(stderr, "Looks like AMD\n"); } } void 
setAffinity(int core) { int rc; cpu_set_t cpuset; pthread_t thread = pthread_self(); CPU_ZERO(&cpuset); CPU_SET(core, &cpuset); rc = pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset); if (rc != 0) { fprintf(stderr, "unable to set thread affinity to %d\n", core); } } int openMsr(int core) { char msrFilename[255]; int fd; sprintf(msrFilename, "/dev/cpu/%d/msr", core); fd = open(msrFilename, O_RDWR); if (fd < 0) { fprintf(stderr, "Could not open MSR file, core %d\n", core); return -1; } return fd; } uint64_t readMsr(int fd, uint32_t addr) { uint64_t result, bytesRead; bytesRead = pread(fd, &result, sizeof(result), addr); if (bytesRead != sizeof(result)) { fprintf(stderr, "Could not read from fd %d, msr %u\n", fd, addr); } return result; } void writeMsr(int fd, uint32_t addr, uint64_t value) { uint64_t bytesWritten, newValue; bytesWritten = pwrite(fd, &value, sizeof(value), addr); if (bytesWritten != sizeof(value)) { fprintf(stderr, "Could not write to fd %d, msr %u, value %lu\n", fd, addr, value); } newValue = readMsr(fd, addr); if (value != newValue) { fprintf(stderr, "Wrote to fd %d, msr %u, value %lu, but write did not take effect\n", fd, addr, value); } } void setBoost(int on) { uint64_t hwcrValue; for (int i = 0; i < numProcs; i++) { setAffinity(i); if (!msrFds[i]) msrFds[i] = openMsr(i); hwcrValue = readMsr(msrFds[i], HWCR); if (on) { hwcrValue &= ~(1UL << 25); // unset bit to request CPB on //fprintf(stderr, "Requesting CPB on (unsetting bit 25 in HWCR): 0x%08x\n", hwcrValue); } else { hwcrValue |= (1UL << 25); // set bit to disable CPB //fprintf(stderr, "Requesting CPB off (setting bit 25 in HWCR): 0x%08x\n", hwcrValue); } writeMsr(msrFds[i], HWCR, hwcrValue); } } float getEnergyStatusUnits() { uint64_t energyUnits, raplPwrUnit; setAffinity(0); if (!msrFds[0]) msrFds[0] = openMsr(0); if (amdCpu) { raplPwrUnit = readMsr(msrFds[0], MSR_RAPL_PWR_UNIT); } else { raplPwrUnit = readMsr(msrFds[0], INTEL_MSR_RAPL_PWR_UNIT); } energyUnits = (raplPwrUnit >> 8) & 
0x1F; return (float)pow(0.5, (double)energyUnits); } uint64_t getCoreEnergyStat(int core) { if (!msrFds[core]) msrFds[core] = openMsr(core); if (amdCpu) return readMsr(msrFds[core], MSR_CORE_ENERGY_STAT); else return readMsr(msrFds[core], INTEL_MSR_PP0_ENERGY_STATUS); } uint64_t getPkgEnergyStat(int core) { if (!msrFds[core]) msrFds[core] = openMsr(core); if (amdCpu) return readMsr(msrFds[core], MSR_PKG_ENERGY_STAT); else return readMsr(msrFds[core], INTEL_MSR_PKG_ENERGY_STATUS); } uint64_t getTotalCoreEnergy() { if (amdCpu) { uint64_t totalCoreEnergy = 0; // only testing the 5950X and 3950X for now, and physical cores // are 0-15 on linux. hack around this until I have time to // programatically figure out SMT siblings for (int i = 0; i < 16; i++) { totalCoreEnergy += getCoreEnergyStat(i); } return totalCoreEnergy; } else { // intel does not track power per core return getCoreEnergyStat(0); } } ================================================ FILE: CoreClockChecker/CoreClockChecker_x86.s ================================================ .global clktest /* %rdi = arg0 = iteration count */ clktest: push %rbx push %r8 push %r9 mov $1, %r8 mov $20, %r9 xor %rbx, %rbx clktest_loop: add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx sub %r9, %rdi jnz clktest_loop pop %r9 pop %r8 pop %rbx ret ================================================ FILE: CoreClockChecker/Makefile ================================================ include ../Common/arch_detect.mk CFLAGS = -O3 LDFLAGS = -lm all: $(TARGET) amd64: $(CC) $(CFLAGS) -pthread CoreClockChecker.c CoreClockChecker_x86.s -o CoreClockChecker_amd64 $(LDFLAGS) $(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_x86.s -o BoostClockChecker_amd64 $(LDFLAGS) aarch64: $(CC) $(CFLAGS) BoostClockChecker.c 
BoostClockChecker_arm.s -o BoostClockChecker_aarch64 $(LDFLAGS) w64: $(CC) $(CFLAGS) BoostClockChecker.c BoostClockChecker_x86.s -o BoostClockChecker_w64.exe $(LDFLAGS) ci: amd64 aarch64 w64 clean: rm -f *.o && find . -type f -executable -delete .PHONY: all ci clean ================================================ FILE: CoreClockChecker/WinCoreClockChecker/CoreClockCheckFunctions.asm ================================================ section .text bits 64 global clktest ; rcx = iteration count ; rdx = address of memory location to monitor ; return elapsed tsc clktest: push rdx push rbx push r8 push r9 push r10 push r11 xor rbx, rbx mov r8, 1 ; GLC will eliminate adds with immediates or increments clktest_loop: add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 add rbx, r8 mov r11d, [rdx] test r11d, r11d jnz clktest_loop_end ; early exit condition (someone else exited) sub rcx, 20 jg clktest_loop mov [rdx], r8 clktest_loop_end: mov rax, rbx pop r11 pop r10 pop r9 pop r8 pop rbx pop rdx ret ================================================ FILE: CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.cpp ================================================ // WinCoreClockChecker.cpp : This file contains the 'main' function. Program execution begins and ends there. 
// #include #include #include #include #include extern "C" uint64_t clktest(uint64_t iterations, uint64_t *flag); int ECoreTestOrder[] = { 2, 3, 4, 5, 6, 7, 8, 9 }; int BackwardECoreTestOrder[] = { 9, 8, 7, 6, 5, 4, 3, 2 }; int AlternatingECoreTestOrder[] = { 2, 6, 3, 7, 4, 8, 5, 9 }; int PCoreTestOrder[] = { 12, 10, 14, 16, 18, 0 }; int AllECores[] = { 20, 21, 2, 3, 4, 5, 6, 7, 8, 9 }; int AllCores[] = { 12, 10, 14, 16, 18, 0, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21 }; struct ClockTestData { uint64_t iterations; uint64_t completed_iterations; uint64_t *flag; }; float* runMtClockTest(int* cores, int nCores); void PrintResults(int* cores, float* results, int coreCount); void RunCoreByCoreClockTest(int* cores, int coreCount); void RunEvenCoreTest(int coreCount); uint64_t start_iterations = 8e9; int main(int argc, char *argv[]) { // Test E-Cores one by one start_iterations = 8e9; if (argc > 1) { int evenCoreCount = atoi(argv[1]); printf("Even Cores, core count %d\n"); RunEvenCoreTest(evenCoreCount); } int eCoreCount = sizeof(ECoreTestOrder) / sizeof(int); printf("E-Cores, Warmup:\n"); RunCoreByCoreClockTest(ECoreTestOrder, sizeof(ECoreTestOrder) / sizeof(int)); printf("E-Cores, filling one cluster first:\n"); RunCoreByCoreClockTest(ECoreTestOrder, sizeof(ECoreTestOrder) / sizeof(int)); printf("E-Cores, filling other cluster first but still one cluster at a time:\n"); RunCoreByCoreClockTest(BackwardECoreTestOrder, sizeof(BackwardECoreTestOrder) / sizeof(int)); printf("E-Cores, alternating cores between clusters:\n"); RunCoreByCoreClockTest(AlternatingECoreTestOrder, sizeof(AlternatingECoreTestOrder) / sizeof(int)); printf("E-Cores, LPE first:\n"); RunCoreByCoreClockTest(AllECores, sizeof(AllECores) / sizeof(int)); start_iterations = 12e9; printf("P-Cores, warmup:\n"); RunCoreByCoreClockTest(PCoreTestOrder, sizeof(PCoreTestOrder) / sizeof(int)); printf("P-Cores, fastest core first:\n"); RunCoreByCoreClockTest(PCoreTestOrder, sizeof(PCoreTestOrder) / sizeof(int)); printf("All 
cores, fastest core first:\n"); RunCoreByCoreClockTest(AllCores, sizeof(AllCores) / sizeof(int)); return 0; } void RunEvenCoreTest(int coreCount) { int* coreSequence = (int *)malloc(sizeof(int) * coreCount); for (int i = 0; i < coreCount; i++) { coreSequence[i] = i * 2; } RunCoreByCoreClockTest(coreSequence, coreCount); free(coreSequence); } void RunCoreByCoreClockTest(int *cores, int coreCount) { float* coreByCoreResults = (float*)malloc(sizeof(float) * coreCount * coreCount); memset(coreByCoreResults, 0, sizeof(float) * coreCount * coreCount); for (int i = 0; i < coreCount; i++) { float* results = runMtClockTest(cores, i + 1); for (int j = 0; j < (i + 1); j++) { coreByCoreResults[coreCount * i + j] = results[j]; } free(results); } PrintResults(cores, coreByCoreResults, coreCount); free(coreByCoreResults); } void PrintResults(int *cores, float* results, int coreCount) { // print csv header for (int i = 0; i < coreCount; i++) { printf(",%d", cores[i]); } printf("\n"); for (int currentCoreCountIndex = 0; currentCoreCountIndex < coreCount; currentCoreCountIndex++) { printf("%d", currentCoreCountIndex + 1); for (int currentCoreIdx = 0; currentCoreIdx < coreCount; currentCoreIdx++) { float currentResult = results[coreCount * currentCoreCountIndex + currentCoreIdx]; if (currentResult != 0.0f) printf(",%f", currentResult); else printf(",-"); } printf("\n"); } } DWORD WINAPI ClockTestThread(LPVOID param) { struct ClockTestData* testData = (struct ClockTestData*)param; testData->completed_iterations = clktest(testData->iterations, testData->flag); return 0; } // cores = array of test order -> logical core id float* runMtClockTest(int* cores, int nCores) { struct timeb start, end; struct ClockTestData* threadData = (struct ClockTestData*)malloc(sizeof(struct ClockTestData) * nCores); float* results = (float*)malloc(sizeof(float) * nCores); memset(results, 0, sizeof(float) * nCores); HANDLE* testThreads = (HANDLE*)malloc(sizeof(HANDLE) * nCores); // try to align test times 
float maxThreadTsc, minThreadTsc; float time_diff_sec; uint64_t flag = 0; for (int i = 0; i < nCores; i++) { threadData[i].iterations = start_iterations; threadData[i].flag = &flag; testThreads[i] = CreateThread(NULL, 0, ClockTestThread, threadData + i, CREATE_SUSPENDED, NULL); SetThreadAffinityMask(testThreads[i], 1ULL << (uint64_t)cores[i]); } ftime(&start); for (int i = 0; i < nCores; i++) { ResumeThread(testThreads[i]); } WaitForMultipleObjects(nCores, testThreads, TRUE, INFINITE); ftime(&end); time_diff_sec = (float)(end.time - start.time) + 0.001f * (end.millitm - start.millitm); for (int i = 0; i < nCores; i++) { // fprintf(stderr, "Core %d: %llu iterations in %f sec\n", cores[i], threadData[i].completed_iterations, time_diff_sec); float ghz = ((float)threadData[i].completed_iterations / 1e9) / time_diff_sec; // fprintf(stderr, "Core %d: %f GHz\n", cores[i], ghz); results[i] = ghz; } free(testThreads); free(threadData); return results; } ================================================ FILE: CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.sln ================================================  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 VisualStudioVersion = 17.9.34723.18 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "WinCoreClockChecker", "WinCoreClockChecker.vcxproj", "{D70EC1DD-794C-4156-8483-227E566CC76B}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 Debug|x86 = Debug|x86 Release|x64 = Release|x64 Release|x86 = Release|x86 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x64.ActiveCfg = Debug|x64 {D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x64.Build.0 = Debug|x64 {D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x86.ActiveCfg = Debug|Win32 {D70EC1DD-794C-4156-8483-227E566CC76B}.Debug|x86.Build.0 = Debug|Win32 
{D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x64.ActiveCfg = Release|x64 {D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x64.Build.0 = Release|x64 {D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x86.ActiveCfg = Release|Win32 {D70EC1DD-794C-4156-8483-227E566CC76B}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {6AA7051E-EAEF-48CA-9C08-8641D57B3EB1} EndGlobalSection EndGlobal ================================================ FILE: CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.vcxproj ================================================ Debug Win32 Release Win32 Debug x64 Release x64 17.0 Win32Proj {d70ec1dd-794c-4156-8483-227e566cc76b} WinCoreClockChecker 10.0 Application true v143 Unicode Application false v143 true Unicode Application true v143 Unicode Application false v143 true Unicode Level3 true WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true Level3 true true true WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true true true Level3 true _DEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true Level3 true true true NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true true true Document nasm -f win64 CoreClockCheckFunctions.asm CoreClockCheckFunctions.obj nasm -f win64 CoreClockCheckFunctions.asm CoreClockCheckFunctions.obj nasm -f win64 CoreClockCheckFunctions.asm CoreClockCheckFunctions.obj nasm -f win64 CoreClockCheckFunctions.asm CoreClockCheckFunctions.obj ================================================ FILE: CoreClockChecker/WinCoreClockChecker/WinCoreClockChecker.vcxproj.filters ================================================  {4FC737F1-C7A5-4376-A066-2A32D752A2FF} cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx {93995380-89BD-4b04-88EB-625FBE52EBFB} h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 
rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms Source Files Source Files

================================================
FILE: GpuMemLatency/Makefile
================================================
include ../Common/arch_detect.mk

OCL_VER = v2023.04.17
CI_SCRIPT = ../Common/ci_gpumemlatency.sh

CFLAGS = -O3 -I ../Common
# Shared header every object depends on (the file is timing.h, not timings.h)
DEPS = ../Common/timing.h
OBJ = opencltest.o latency_test.o bw_test.o common.o atomic_test.o instruction_rate.o timing.o
LDFLAGS ?= -lm -lOpenCL

ifeq ($(TARGET), Darwin)
LDFLAGS = -lm -framework OpenCL
endif

all: $(TARGET)

GpuMemLatency: $(OBJ)
	$(CC) $(CPPFLAGS) $(CFLAGS) $^ -o $@ $(LDFLAGS)

%.o: %.c $(DEPS)
	$(CC) $(CFLAGS) -c -o $@ $<

# timing.c lives in ../Common; list it so that edits there trigger a rebuild
timing.o: ../Common/timing.c $(DEPS)
	$(CC) $(CFLAGS) -c ../Common/timing.c -o timing.o

amd64: $(OBJ)
	$(CC) $(CFLAGS) $^ -o GpuMemLatency_amd64 $(LDFLAGS)

aarch64: $(OBJ)
	$(CC) $(CFLAGS) $^ -o GpuMemLatency_aarch64 $(LDFLAGS)

riscv64: $(OBJ)
	$(CC) $(CFLAGS) $^ -o GpuMemLatency_riscv64 $(LDFLAGS)

w64: $(OBJ)
	$(CC) $(CFLAGS) $^ -o GpuMemLatency_w64.exe $(LDFLAGS)

darwin: $(OBJ)
	$(CC) $(CFLAGS) $^ -o GpuMemLatency_darwin $(LDFLAGS)

ci: clean
	@OCL_VER=$(OCL_VER) sh $(CI_SCRIPT)

# The patterns must stay unquoted: inside quotes the shell passes the literal
# text "*.deb" to rm and nothing is removed.
clean-ci:
	rm -rf ./*.deb ./*.zip ./ocl-icd-* ./OpenCL-SDK-*

clean-obj:
	rm -f *.o

clean: clean-ci clean-obj
	find . -type f -executable -delete

.PHONY: all amd64 aarch64 riscv64 w64 darwin ci clean-ci clean-obj clean

================================================
FILE: GpuMemLatency/OpenCL/LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: GpuMemLatency/OpenCL/README.md ================================================ # OpenCL<sup>TM</sup> API Headers This repository contains C language headers for the OpenCL API. The authoritative public repository for these headers is located at: https://github.com/KhronosGroup/OpenCL-Headers Issues, proposed fixes for issues, and other suggested changes should be created using GitHub. ## Branch Structure The OpenCL API headers in this repository are Unified headers and are designed to work with all released OpenCL versions. This differs from previous OpenCL API headers, where version-specific API headers either existed in separate branches, or in separate folders in a branch. ## Compiling for a Specific OpenCL Version By default, the OpenCL API headers in this repository are for the latest OpenCL version (currently OpenCL 2.2). 
To use these API headers to target a different OpenCL version, an application may `#define` the preprocessor value `CL_TARGET_OPENCL_VERSION` before including the OpenCL API headers. The `CL_TARGET_OPENCL_VERSION` is a three digit decimal value representing the OpenCL API version. For example, to enforce usage of no more than the OpenCL 1.2 APIs, you may include the OpenCL API headers as follows: ``` #define CL_TARGET_OPENCL_VERSION 120 #include <CL/opencl.h> ``` ## Directory Structure ``` README.md This file LICENSE Source license for the OpenCL API headers CL/ Unified OpenCL API headers tree ``` ## License See [LICENSE](LICENSE). --- OpenCL and the OpenCL logo are trademarks of Apple Inc. used by permission by Khronos. ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl.h ================================================ /******************************************************************************* * Copyright (c) 2008-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ #ifndef __OPENCL_CL_H #define __OPENCL_CL_H #include #include #ifdef __cplusplus extern "C" { #endif /******************************************************************************/ typedef struct _cl_platform_id * cl_platform_id; typedef struct _cl_device_id * cl_device_id; typedef struct _cl_context * cl_context; typedef struct _cl_command_queue * cl_command_queue; typedef struct _cl_mem * cl_mem; typedef struct _cl_program * cl_program; typedef struct _cl_kernel * cl_kernel; typedef struct _cl_event * cl_event; typedef struct _cl_sampler * cl_sampler; typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ typedef cl_ulong cl_bitfield; typedef cl_bitfield cl_device_type; typedef cl_uint cl_platform_info; typedef cl_uint cl_device_info; typedef cl_bitfield cl_device_fp_config; typedef cl_uint cl_device_mem_cache_type; typedef cl_uint cl_device_local_mem_type; typedef cl_bitfield cl_device_exec_capabilities; #ifdef CL_VERSION_2_0 typedef cl_bitfield cl_device_svm_capabilities; #endif typedef cl_bitfield cl_command_queue_properties; #ifdef CL_VERSION_1_2 typedef intptr_t cl_device_partition_property; typedef cl_bitfield cl_device_affinity_domain; #endif typedef intptr_t cl_context_properties; typedef cl_uint cl_context_info; #ifdef CL_VERSION_2_0 typedef cl_bitfield cl_queue_properties; #endif typedef cl_uint cl_command_queue_info; typedef cl_uint cl_channel_order; typedef cl_uint cl_channel_type; typedef cl_bitfield cl_mem_flags; #ifdef CL_VERSION_2_0 typedef cl_bitfield cl_svm_mem_flags; #endif typedef cl_uint cl_mem_object_type; typedef cl_uint cl_mem_info; #ifdef CL_VERSION_1_2 typedef cl_bitfield cl_mem_migration_flags; #endif typedef cl_uint cl_image_info; #ifdef CL_VERSION_1_1 typedef cl_uint cl_buffer_create_type; #endif typedef cl_uint cl_addressing_mode; typedef cl_uint cl_filter_mode; 
typedef cl_uint cl_sampler_info; typedef cl_bitfield cl_map_flags; #ifdef CL_VERSION_2_0 typedef intptr_t cl_pipe_properties; typedef cl_uint cl_pipe_info; #endif typedef cl_uint cl_program_info; typedef cl_uint cl_program_build_info; #ifdef CL_VERSION_1_2 typedef cl_uint cl_program_binary_type; #endif typedef cl_int cl_build_status; typedef cl_uint cl_kernel_info; #ifdef CL_VERSION_1_2 typedef cl_uint cl_kernel_arg_info; typedef cl_uint cl_kernel_arg_address_qualifier; typedef cl_uint cl_kernel_arg_access_qualifier; typedef cl_bitfield cl_kernel_arg_type_qualifier; #endif typedef cl_uint cl_kernel_work_group_info; #ifdef CL_VERSION_2_1 typedef cl_uint cl_kernel_sub_group_info; #endif typedef cl_uint cl_event_info; typedef cl_uint cl_command_type; typedef cl_uint cl_profiling_info; #ifdef CL_VERSION_2_0 typedef cl_bitfield cl_sampler_properties; typedef cl_uint cl_kernel_exec_info; #endif #ifdef CL_VERSION_3_0 typedef cl_bitfield cl_device_atomic_capabilities; typedef cl_uint cl_khronos_vendor_id; typedef cl_bitfield cl_mem_properties; typedef cl_uint cl_version; #endif typedef struct _cl_image_format { cl_channel_order image_channel_order; cl_channel_type image_channel_data_type; } cl_image_format; #ifdef CL_VERSION_1_2 typedef struct _cl_image_desc { cl_mem_object_type image_type; size_t image_width; size_t image_height; size_t image_depth; size_t image_array_size; size_t image_row_pitch; size_t image_slice_pitch; cl_uint num_mip_levels; cl_uint num_samples; #ifdef CL_VERSION_2_0 #ifdef __GNUC__ __extension__ /* Prevents warnings about anonymous union in -pedantic builds */ #endif #ifdef _MSC_VER #pragma warning( push ) #pragma warning( disable : 4201 ) /* Prevents warning about nameless struct/union in /W4 /Za builds */ #endif union { #endif cl_mem buffer; #ifdef CL_VERSION_2_0 cl_mem mem_object; }; #ifdef _MSC_VER #pragma warning( pop ) #endif #endif } cl_image_desc; #endif #ifdef CL_VERSION_1_1 typedef struct _cl_buffer_region { size_t origin; size_t size; } 
cl_buffer_region; #endif #ifdef CL_VERSION_3_0 #define CL_NAME_VERSION_MAX_NAME_SIZE 64 typedef struct _cl_name_version { cl_version version; char name[CL_NAME_VERSION_MAX_NAME_SIZE]; } cl_name_version; #endif /******************************************************************************/ /* Error Codes */ #define CL_SUCCESS 0 #define CL_DEVICE_NOT_FOUND -1 #define CL_DEVICE_NOT_AVAILABLE -2 #define CL_COMPILER_NOT_AVAILABLE -3 #define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 #define CL_OUT_OF_RESOURCES -5 #define CL_OUT_OF_HOST_MEMORY -6 #define CL_PROFILING_INFO_NOT_AVAILABLE -7 #define CL_MEM_COPY_OVERLAP -8 #define CL_IMAGE_FORMAT_MISMATCH -9 #define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 #define CL_BUILD_PROGRAM_FAILURE -11 #define CL_MAP_FAILURE -12 #ifdef CL_VERSION_1_1 #define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 #define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 #endif #ifdef CL_VERSION_1_2 #define CL_COMPILE_PROGRAM_FAILURE -15 #define CL_LINKER_NOT_AVAILABLE -16 #define CL_LINK_PROGRAM_FAILURE -17 #define CL_DEVICE_PARTITION_FAILED -18 #define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 #endif #define CL_INVALID_VALUE -30 #define CL_INVALID_DEVICE_TYPE -31 #define CL_INVALID_PLATFORM -32 #define CL_INVALID_DEVICE -33 #define CL_INVALID_CONTEXT -34 #define CL_INVALID_QUEUE_PROPERTIES -35 #define CL_INVALID_COMMAND_QUEUE -36 #define CL_INVALID_HOST_PTR -37 #define CL_INVALID_MEM_OBJECT -38 #define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 #define CL_INVALID_IMAGE_SIZE -40 #define CL_INVALID_SAMPLER -41 #define CL_INVALID_BINARY -42 #define CL_INVALID_BUILD_OPTIONS -43 #define CL_INVALID_PROGRAM -44 #define CL_INVALID_PROGRAM_EXECUTABLE -45 #define CL_INVALID_KERNEL_NAME -46 #define CL_INVALID_KERNEL_DEFINITION -47 #define CL_INVALID_KERNEL -48 #define CL_INVALID_ARG_INDEX -49 #define CL_INVALID_ARG_VALUE -50 #define CL_INVALID_ARG_SIZE -51 #define CL_INVALID_KERNEL_ARGS -52 #define CL_INVALID_WORK_DIMENSION -53 #define CL_INVALID_WORK_GROUP_SIZE -54 #define 
CL_INVALID_WORK_ITEM_SIZE -55 #define CL_INVALID_GLOBAL_OFFSET -56 #define CL_INVALID_EVENT_WAIT_LIST -57 #define CL_INVALID_EVENT -58 #define CL_INVALID_OPERATION -59 #define CL_INVALID_GL_OBJECT -60 #define CL_INVALID_BUFFER_SIZE -61 #define CL_INVALID_MIP_LEVEL -62 #define CL_INVALID_GLOBAL_WORK_SIZE -63 #ifdef CL_VERSION_1_1 #define CL_INVALID_PROPERTY -64 #endif #ifdef CL_VERSION_1_2 #define CL_INVALID_IMAGE_DESCRIPTOR -65 #define CL_INVALID_COMPILER_OPTIONS -66 #define CL_INVALID_LINKER_OPTIONS -67 #define CL_INVALID_DEVICE_PARTITION_COUNT -68 #endif #ifdef CL_VERSION_2_0 #define CL_INVALID_PIPE_SIZE -69 #define CL_INVALID_DEVICE_QUEUE -70 #endif #ifdef CL_VERSION_2_2 #define CL_INVALID_SPEC_ID -71 #define CL_MAX_SIZE_RESTRICTION_EXCEEDED -72 #endif /* cl_bool */ #define CL_FALSE 0 #define CL_TRUE 1 #ifdef CL_VERSION_1_2 #define CL_BLOCKING CL_TRUE #define CL_NON_BLOCKING CL_FALSE #endif /* cl_platform_info */ #define CL_PLATFORM_PROFILE 0x0900 #define CL_PLATFORM_VERSION 0x0901 #define CL_PLATFORM_NAME 0x0902 #define CL_PLATFORM_VENDOR 0x0903 #define CL_PLATFORM_EXTENSIONS 0x0904 #ifdef CL_VERSION_2_1 #define CL_PLATFORM_HOST_TIMER_RESOLUTION 0x0905 #endif #ifdef CL_VERSION_3_0 #define CL_PLATFORM_NUMERIC_VERSION 0x0906 #define CL_PLATFORM_EXTENSIONS_WITH_VERSION 0x0907 #endif /* cl_device_type - bitfield */ #define CL_DEVICE_TYPE_DEFAULT (1 << 0) #define CL_DEVICE_TYPE_CPU (1 << 1) #define CL_DEVICE_TYPE_GPU (1 << 2) #define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) #ifdef CL_VERSION_1_2 #define CL_DEVICE_TYPE_CUSTOM (1 << 4) #endif #define CL_DEVICE_TYPE_ALL 0xFFFFFFFF /* cl_device_info */ #define CL_DEVICE_TYPE 0x1000 #define CL_DEVICE_VENDOR_ID 0x1001 #define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 #define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 #define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 #define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 #define 
CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B #define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C #define CL_DEVICE_ADDRESS_BITS 0x100D #define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E #define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F #define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 #define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 #define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 #define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 #define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 #define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 #define CL_DEVICE_IMAGE_SUPPORT 0x1016 #define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 #define CL_DEVICE_MAX_SAMPLERS 0x1018 #define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 #define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A #define CL_DEVICE_SINGLE_FP_CONFIG 0x101B #define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C #define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D #define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E #define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F #define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 #define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 #define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 #define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 #define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 #define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 #define CL_DEVICE_ENDIAN_LITTLE 0x1026 #define CL_DEVICE_AVAILABLE 0x1027 #define CL_DEVICE_COMPILER_AVAILABLE 0x1028 #define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 #define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */ #ifdef CL_VERSION_2_0 #define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A #endif #define CL_DEVICE_NAME 0x102B #define CL_DEVICE_VENDOR 0x102C #define CL_DRIVER_VERSION 0x102D #define CL_DEVICE_PROFILE 0x102E #define CL_DEVICE_VERSION 0x102F #define CL_DEVICE_EXTENSIONS 0x1030 #define CL_DEVICE_PLATFORM 0x1031 #ifdef CL_VERSION_1_2 #define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 #endif /* 0x1033 reserved for 
CL_DEVICE_HALF_FP_CONFIG which is already defined in "cl_ext.h" */ #ifdef CL_VERSION_1_1 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 #define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */ #define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A #define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B #define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C #define CL_DEVICE_OPENCL_C_VERSION 0x103D #endif #ifdef CL_VERSION_1_2 #define CL_DEVICE_LINKER_AVAILABLE 0x103E #define CL_DEVICE_BUILT_IN_KERNELS 0x103F #define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 #define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 #define CL_DEVICE_PARENT_DEVICE 0x1042 #define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 #define CL_DEVICE_PARTITION_PROPERTIES 0x1044 #define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 #define CL_DEVICE_PARTITION_TYPE 0x1046 #define CL_DEVICE_REFERENCE_COUNT 0x1047 #define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 #define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 #endif #ifdef CL_VERSION_2_0 #define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A #define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B #define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C #define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D #define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E #define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F #define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050 #define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051 #define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052 #define CL_DEVICE_SVM_CAPABILITIES 0x1053 #define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054 #define CL_DEVICE_MAX_PIPE_ARGS 0x1055 #define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056 #define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057 #define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058 #define 
CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059 #define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A #endif #ifdef CL_VERSION_2_1 #define CL_DEVICE_IL_VERSION 0x105B #define CL_DEVICE_MAX_NUM_SUB_GROUPS 0x105C #define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D #endif #ifdef CL_VERSION_3_0 #define CL_DEVICE_NUMERIC_VERSION 0x105E #define CL_DEVICE_EXTENSIONS_WITH_VERSION 0x1060 #define CL_DEVICE_ILS_WITH_VERSION 0x1061 #define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION 0x1062 #define CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES 0x1063 #define CL_DEVICE_ATOMIC_FENCE_CAPABILITIES 0x1064 #define CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT 0x1065 #define CL_DEVICE_OPENCL_C_ALL_VERSIONS 0x1066 #define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x1067 #define CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT 0x1068 #define CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT 0x1069 /* 0x106A to 0x106E - Reserved for upcoming KHR extension */ #define CL_DEVICE_OPENCL_C_FEATURES 0x106F #define CL_DEVICE_DEVICE_ENQUEUE_SUPPORT 0x1070 #define CL_DEVICE_PIPE_SUPPORT 0x1071 #endif /* cl_device_fp_config - bitfield */ #define CL_FP_DENORM (1 << 0) #define CL_FP_INF_NAN (1 << 1) #define CL_FP_ROUND_TO_NEAREST (1 << 2) #define CL_FP_ROUND_TO_ZERO (1 << 3) #define CL_FP_ROUND_TO_INF (1 << 4) #define CL_FP_FMA (1 << 5) #ifdef CL_VERSION_1_1 #define CL_FP_SOFT_FLOAT (1 << 6) #endif #ifdef CL_VERSION_1_2 #define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) #endif /* cl_device_mem_cache_type */ #define CL_NONE 0x0 #define CL_READ_ONLY_CACHE 0x1 #define CL_READ_WRITE_CACHE 0x2 /* cl_device_local_mem_type */ #define CL_LOCAL 0x1 #define CL_GLOBAL 0x2 /* cl_device_exec_capabilities - bitfield */ #define CL_EXEC_KERNEL (1 << 0) #define CL_EXEC_NATIVE_KERNEL (1 << 1) /* cl_command_queue_properties - bitfield */ #define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) #define CL_QUEUE_PROFILING_ENABLE (1 << 1) #ifdef CL_VERSION_2_0 #define CL_QUEUE_ON_DEVICE (1 << 2) #define 
CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3) #endif /* cl_context_info */ #define CL_CONTEXT_REFERENCE_COUNT 0x1080 #define CL_CONTEXT_DEVICES 0x1081 #define CL_CONTEXT_PROPERTIES 0x1082 #ifdef CL_VERSION_1_1 #define CL_CONTEXT_NUM_DEVICES 0x1083 #endif /* cl_context_properties */ #define CL_CONTEXT_PLATFORM 0x1084 #ifdef CL_VERSION_1_2 #define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 #endif #ifdef CL_VERSION_1_2 /* cl_device_partition_property */ #define CL_DEVICE_PARTITION_EQUALLY 0x1086 #define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 #define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 #endif #ifdef CL_VERSION_1_2 /* cl_device_affinity_domain */ #define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) #define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) #define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) #define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) #define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) #define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) #endif #ifdef CL_VERSION_2_0 /* cl_device_svm_capabilities */ #define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0) #define CL_DEVICE_SVM_FINE_GRAIN_BUFFER (1 << 1) #define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (1 << 2) #define CL_DEVICE_SVM_ATOMICS (1 << 3) #endif /* cl_command_queue_info */ #define CL_QUEUE_CONTEXT 0x1090 #define CL_QUEUE_DEVICE 0x1091 #define CL_QUEUE_REFERENCE_COUNT 0x1092 #define CL_QUEUE_PROPERTIES 0x1093 #ifdef CL_VERSION_2_0 #define CL_QUEUE_SIZE 0x1094 #endif #ifdef CL_VERSION_2_1 #define CL_QUEUE_DEVICE_DEFAULT 0x1095 #endif #ifdef CL_VERSION_3_0 #define CL_QUEUE_PROPERTIES_ARRAY 0x1098 #endif /* cl_mem_flags and cl_svm_mem_flags - bitfield */ #define CL_MEM_READ_WRITE (1 << 0) #define CL_MEM_WRITE_ONLY (1 << 1) #define CL_MEM_READ_ONLY (1 << 2) #define CL_MEM_USE_HOST_PTR (1 << 3) #define CL_MEM_ALLOC_HOST_PTR (1 << 4) #define CL_MEM_COPY_HOST_PTR (1 << 5) /* reserved (1 << 6) */ #ifdef CL_VERSION_1_2 #define CL_MEM_HOST_WRITE_ONLY (1 << 7) #define 
CL_MEM_HOST_READ_ONLY (1 << 8) #define CL_MEM_HOST_NO_ACCESS (1 << 9) #endif #ifdef CL_VERSION_2_0 #define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */ #define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */ #define CL_MEM_KERNEL_READ_AND_WRITE (1 << 12) #endif #ifdef CL_VERSION_1_2 /* cl_mem_migration_flags - bitfield */ #define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) #define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) #endif /* cl_channel_order */ #define CL_R 0x10B0 #define CL_A 0x10B1 #define CL_RG 0x10B2 #define CL_RA 0x10B3 #define CL_RGB 0x10B4 #define CL_RGBA 0x10B5 #define CL_BGRA 0x10B6 #define CL_ARGB 0x10B7 #define CL_INTENSITY 0x10B8 #define CL_LUMINANCE 0x10B9 #ifdef CL_VERSION_1_1 #define CL_Rx 0x10BA #define CL_RGx 0x10BB #define CL_RGBx 0x10BC #endif #ifdef CL_VERSION_1_2 #define CL_DEPTH 0x10BD #define CL_DEPTH_STENCIL 0x10BE #endif #ifdef CL_VERSION_2_0 #define CL_sRGB 0x10BF #define CL_sRGBx 0x10C0 #define CL_sRGBA 0x10C1 #define CL_sBGRA 0x10C2 #define CL_ABGR 0x10C3 #endif /* cl_channel_type */ #define CL_SNORM_INT8 0x10D0 #define CL_SNORM_INT16 0x10D1 #define CL_UNORM_INT8 0x10D2 #define CL_UNORM_INT16 0x10D3 #define CL_UNORM_SHORT_565 0x10D4 #define CL_UNORM_SHORT_555 0x10D5 #define CL_UNORM_INT_101010 0x10D6 #define CL_SIGNED_INT8 0x10D7 #define CL_SIGNED_INT16 0x10D8 #define CL_SIGNED_INT32 0x10D9 #define CL_UNSIGNED_INT8 0x10DA #define CL_UNSIGNED_INT16 0x10DB #define CL_UNSIGNED_INT32 0x10DC #define CL_HALF_FLOAT 0x10DD #define CL_FLOAT 0x10DE #ifdef CL_VERSION_1_2 #define CL_UNORM_INT24 0x10DF #endif #ifdef CL_VERSION_2_1 #define CL_UNORM_INT_101010_2 0x10E0 #endif /* cl_mem_object_type */ #define CL_MEM_OBJECT_BUFFER 0x10F0 #define CL_MEM_OBJECT_IMAGE2D 0x10F1 #define CL_MEM_OBJECT_IMAGE3D 0x10F2 #ifdef CL_VERSION_1_2 #define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 #define CL_MEM_OBJECT_IMAGE1D 0x10F4 #define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 #define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 #endif 
/*
 * Enumerant tables from the last cl_mem_object_type value through
 * cl_event_info. Every preprocessor directive is on its own line, as the C
 * preprocessor requires. Names and values are unchanged. Entries wrapped in
 * #ifdef CL_VERSION_x_y are visible only when that macro is defined.
 */
#ifdef CL_VERSION_2_0
#define CL_MEM_OBJECT_PIPE 0x10F7
#endif

/* cl_mem_info */
#define CL_MEM_TYPE 0x1100
#define CL_MEM_FLAGS 0x1101
#define CL_MEM_SIZE 0x1102
#define CL_MEM_HOST_PTR 0x1103
#define CL_MEM_MAP_COUNT 0x1104
#define CL_MEM_REFERENCE_COUNT 0x1105
#define CL_MEM_CONTEXT 0x1106
#ifdef CL_VERSION_1_1
#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107
#define CL_MEM_OFFSET 0x1108
#endif
#ifdef CL_VERSION_2_0
#define CL_MEM_USES_SVM_POINTER 0x1109
#endif
#ifdef CL_VERSION_3_0
#define CL_MEM_PROPERTIES 0x110A
#endif

/* cl_image_info */
#define CL_IMAGE_FORMAT 0x1110
#define CL_IMAGE_ELEMENT_SIZE 0x1111
#define CL_IMAGE_ROW_PITCH 0x1112
#define CL_IMAGE_SLICE_PITCH 0x1113
#define CL_IMAGE_WIDTH 0x1114
#define CL_IMAGE_HEIGHT 0x1115
#define CL_IMAGE_DEPTH 0x1116
#ifdef CL_VERSION_1_2
#define CL_IMAGE_ARRAY_SIZE 0x1117
#define CL_IMAGE_BUFFER 0x1118
#define CL_IMAGE_NUM_MIP_LEVELS 0x1119
#define CL_IMAGE_NUM_SAMPLES 0x111A
#endif

/* cl_pipe_info */
#ifdef CL_VERSION_2_0
#define CL_PIPE_PACKET_SIZE 0x1120
#define CL_PIPE_MAX_PACKETS 0x1121
#endif
#ifdef CL_VERSION_3_0
#define CL_PIPE_PROPERTIES 0x1122
#endif

/* cl_addressing_mode */
#define CL_ADDRESS_NONE 0x1130
#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131
#define CL_ADDRESS_CLAMP 0x1132
#define CL_ADDRESS_REPEAT 0x1133
#ifdef CL_VERSION_1_1
#define CL_ADDRESS_MIRRORED_REPEAT 0x1134
#endif

/* cl_filter_mode */
#define CL_FILTER_NEAREST 0x1140
#define CL_FILTER_LINEAR 0x1141

/* cl_sampler_info */
#define CL_SAMPLER_REFERENCE_COUNT 0x1150
#define CL_SAMPLER_CONTEXT 0x1151
#define CL_SAMPLER_NORMALIZED_COORDS 0x1152
#define CL_SAMPLER_ADDRESSING_MODE 0x1153
#define CL_SAMPLER_FILTER_MODE 0x1154
#ifdef CL_VERSION_2_0
/* These enumerants are for the cl_khr_mipmap_image extension.
   They have since been added to cl_ext.h with an appropriate
   KHR suffix, but are left here for backwards compatibility. */
#define CL_SAMPLER_MIP_FILTER_MODE 0x1155
#define CL_SAMPLER_LOD_MIN 0x1156
#define CL_SAMPLER_LOD_MAX 0x1157
#endif
#ifdef CL_VERSION_3_0
#define CL_SAMPLER_PROPERTIES 0x1158
#endif

/* cl_map_flags - bitfield */
#define CL_MAP_READ (1 << 0)
#define CL_MAP_WRITE (1 << 1)
#ifdef CL_VERSION_1_2
#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)
#endif

/* cl_program_info */
#define CL_PROGRAM_REFERENCE_COUNT 0x1160
#define CL_PROGRAM_CONTEXT 0x1161
#define CL_PROGRAM_NUM_DEVICES 0x1162
#define CL_PROGRAM_DEVICES 0x1163
#define CL_PROGRAM_SOURCE 0x1164
#define CL_PROGRAM_BINARY_SIZES 0x1165
#define CL_PROGRAM_BINARIES 0x1166
#ifdef CL_VERSION_1_2
#define CL_PROGRAM_NUM_KERNELS 0x1167
#define CL_PROGRAM_KERNEL_NAMES 0x1168
#endif
#ifdef CL_VERSION_2_1
#define CL_PROGRAM_IL 0x1169
#endif
#ifdef CL_VERSION_2_2
#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT 0x116A
#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT 0x116B
#endif

/* cl_program_build_info */
/* Note: this table starts at 0x1181; 0x1180 is not assigned here. */
#define CL_PROGRAM_BUILD_STATUS 0x1181
#define CL_PROGRAM_BUILD_OPTIONS 0x1182
#define CL_PROGRAM_BUILD_LOG 0x1183
#ifdef CL_VERSION_1_2
#define CL_PROGRAM_BINARY_TYPE 0x1184
#endif
#ifdef CL_VERSION_2_0
#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185
#endif

#ifdef CL_VERSION_1_2

/* cl_program_binary_type */
#define CL_PROGRAM_BINARY_TYPE_NONE 0x0
#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1
#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2
#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4

#endif

/* cl_build_status */
#define CL_BUILD_SUCCESS 0
#define CL_BUILD_NONE -1
#define CL_BUILD_ERROR -2
#define CL_BUILD_IN_PROGRESS -3

/* cl_kernel_info */
#define CL_KERNEL_FUNCTION_NAME 0x1190
#define CL_KERNEL_NUM_ARGS 0x1191
#define CL_KERNEL_REFERENCE_COUNT 0x1192
#define CL_KERNEL_CONTEXT 0x1193
#define CL_KERNEL_PROGRAM 0x1194
#ifdef CL_VERSION_1_2
#define CL_KERNEL_ATTRIBUTES 0x1195
#endif

#ifdef CL_VERSION_1_2

/* cl_kernel_arg_info */
#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196
#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197
#define CL_KERNEL_ARG_TYPE_NAME 0x1198
#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199
#define CL_KERNEL_ARG_NAME 0x119A

#endif

#ifdef CL_VERSION_1_2

/* cl_kernel_arg_address_qualifier */
#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B
#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C
#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D
#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E

#endif

#ifdef CL_VERSION_1_2

/* cl_kernel_arg_access_qualifier */
#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0
#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1
#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2
#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3

#endif

#ifdef CL_VERSION_1_2

/* cl_kernel_arg_type_qualifier */
#define CL_KERNEL_ARG_TYPE_NONE 0
#define CL_KERNEL_ARG_TYPE_CONST (1 << 0)
#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1)
#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2)
#ifdef CL_VERSION_2_0
#define CL_KERNEL_ARG_TYPE_PIPE (1 << 3)
#endif

#endif

/* cl_kernel_work_group_info */
#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0
#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1
#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2
#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4
#ifdef CL_VERSION_1_2
#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5
#endif

#ifdef CL_VERSION_2_1

/* cl_kernel_sub_group_info */
/* Note: the first two values (0x2033, 0x2034) lie outside the 0x11Bx range used by the rest. */
#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE 0x2033
#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE 0x2034
#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT 0x11B8
#define CL_KERNEL_MAX_NUM_SUB_GROUPS 0x11B9
#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS 0x11BA

#endif

#ifdef CL_VERSION_2_0

/* cl_kernel_exec_info */
#define CL_KERNEL_EXEC_INFO_SVM_PTRS 0x11B6
#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM 0x11B7

#endif

/* cl_event_info */
#define CL_EVENT_COMMAND_QUEUE 0x11D0
#define CL_EVENT_COMMAND_TYPE 0x11D1
#define CL_EVENT_REFERENCE_COUNT 0x11D2
#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3
#ifdef CL_VERSION_1_1
#define CL_EVENT_CONTEXT 0x11D4
#endif

/* cl_command_type */
#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 #define CL_COMMAND_TASK 0x11F1 #define CL_COMMAND_NATIVE_KERNEL 0x11F2 #define CL_COMMAND_READ_BUFFER 0x11F3 #define CL_COMMAND_WRITE_BUFFER 0x11F4 #define CL_COMMAND_COPY_BUFFER 0x11F5 #define CL_COMMAND_READ_IMAGE 0x11F6 #define CL_COMMAND_WRITE_IMAGE 0x11F7 #define CL_COMMAND_COPY_IMAGE 0x11F8 #define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 #define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA #define CL_COMMAND_MAP_BUFFER 0x11FB #define CL_COMMAND_MAP_IMAGE 0x11FC #define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD #define CL_COMMAND_MARKER 0x11FE #define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF #define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 #ifdef CL_VERSION_1_1 #define CL_COMMAND_READ_BUFFER_RECT 0x1201 #define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 #define CL_COMMAND_COPY_BUFFER_RECT 0x1203 #define CL_COMMAND_USER 0x1204 #endif #ifdef CL_VERSION_1_2 #define CL_COMMAND_BARRIER 0x1205 #define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 #define CL_COMMAND_FILL_BUFFER 0x1207 #define CL_COMMAND_FILL_IMAGE 0x1208 #endif #ifdef CL_VERSION_2_0 #define CL_COMMAND_SVM_FREE 0x1209 #define CL_COMMAND_SVM_MEMCPY 0x120A #define CL_COMMAND_SVM_MEMFILL 0x120B #define CL_COMMAND_SVM_MAP 0x120C #define CL_COMMAND_SVM_UNMAP 0x120D #endif #ifdef CL_VERSION_3_0 #define CL_COMMAND_SVM_MIGRATE_MEM 0x120E #endif /* command execution status */ #define CL_COMPLETE 0x0 #define CL_RUNNING 0x1 #define CL_SUBMITTED 0x2 #define CL_QUEUED 0x3 /* cl_buffer_create_type */ #ifdef CL_VERSION_1_1 #define CL_BUFFER_CREATE_TYPE_REGION 0x1220 #endif /* cl_profiling_info */ #define CL_PROFILING_COMMAND_QUEUED 0x1280 #define CL_PROFILING_COMMAND_SUBMIT 0x1281 #define CL_PROFILING_COMMAND_START 0x1282 #define CL_PROFILING_COMMAND_END 0x1283 #ifdef CL_VERSION_2_0 #define CL_PROFILING_COMMAND_COMPLETE 0x1284 #endif /* cl_device_atomic_capabilities - bitfield */ #ifdef CL_VERSION_3_0 #define CL_DEVICE_ATOMIC_ORDER_RELAXED (1 << 0) #define CL_DEVICE_ATOMIC_ORDER_ACQ_REL (1 << 1) #define 
CL_DEVICE_ATOMIC_ORDER_SEQ_CST (1 << 2)
/* Bits 0-2 above select memory orders; bits 3-6 below select scopes. */
#define CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM (1 << 3)
#define CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP (1 << 4)
#define CL_DEVICE_ATOMIC_SCOPE_DEVICE (1 << 5)
#define CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES (1 << 6)
#endif

/* cl_khronos_vendor_id */
#define CL_KHRONOS_VENDOR_ID_CODEPLAY 0x10004

#ifdef CL_VERSION_3_0

/* cl_version
 *
 * Helpers for a packed version number. The field widths below add up to
 * 32 bits (10 + 10 + 12): patch occupies the low 12 bits, minor the next
 * 10 bits, and major the 10 bits above that.
 */
#define CL_VERSION_MAJOR_BITS (10)
#define CL_VERSION_MINOR_BITS (10)
#define CL_VERSION_PATCH_BITS (12)

/* All-ones masks, one per field, each as wide as the matching *_BITS
 * value. They are built from the int literal 1, so they have type int. */
#define CL_VERSION_MAJOR_MASK ((1 << CL_VERSION_MAJOR_BITS) - 1)
#define CL_VERSION_MINOR_MASK ((1 << CL_VERSION_MINOR_BITS) - 1)
#define CL_VERSION_PATCH_MASK ((1 << CL_VERSION_PATCH_BITS) - 1)

/* Extract the major field: shift right past the minor and patch fields.
 * No mask is applied here, so any bits above the major field are returned
 * as part of the result.
 * NOTE(review): presumably `version` is always a 32-bit unsigned
 * cl_version, which makes the mask unnecessary; confirm against the
 * typedef, which is outside this part of the header. */
#define CL_VERSION_MAJOR(version) \
    ((version) >> (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS))

/* Extract the minor field: drop the patch bits, then mask to 10 bits. */
#define CL_VERSION_MINOR(version) \
    (((version) >> CL_VERSION_PATCH_BITS) & CL_VERSION_MINOR_MASK)

/* Extract the patch field: the low 12 bits. */
#define CL_VERSION_PATCH(version) ((version) & CL_VERSION_PATCH_MASK)

/* Pack (major, minor, patch) into one value. Each component is masked to
 * its field width first, so out-of-range inputs are truncated rather than
 * spilling into a neighbouring field. Arguments are parenthesised and
 * evaluated exactly once each. */
#define CL_MAKE_VERSION(major, minor, patch) \
    ((((major) & CL_VERSION_MAJOR_MASK) \
        << (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS)) | \
    (((minor) & CL_VERSION_MINOR_MASK) << CL_VERSION_PATCH_BITS) | \
    ((patch) & CL_VERSION_PATCH_MASK))

#endif

/********************************************************************************************************/

/* Platform API */

/* Both platform queries are declared unconditionally (1.0 suffix). */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformIDs(cl_uint num_entries,
                 cl_platform_id * platforms,
                 cl_uint * num_platforms) CL_API_SUFFIX__VERSION_1_0;

extern CL_API_ENTRY cl_int CL_API_CALL
clGetPlatformInfo(cl_platform_id platform,
                  cl_platform_info param_name,
                  size_t param_value_size,
                  void * param_value,
                  size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;

/* Device APIs */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDs(cl_platform_id platform,
               cl_device_type device_type,
               cl_uint num_entries,
               cl_device_id * devices,
               cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;

extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceInfo(cl_device_id device,
                cl_device_info param_name,
size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 extern CL_API_ENTRY cl_int CL_API_CALL clCreateSubDevices(cl_device_id in_device, const cl_device_partition_property * properties, cl_uint num_devices, cl_device_id * out_devices, cl_uint * num_devices_ret) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2; #endif #ifdef CL_VERSION_2_1 extern CL_API_ENTRY cl_int CL_API_CALL clSetDefaultDeviceCommandQueue(cl_context context, cl_device_id device, cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1; extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceAndHostTimer(cl_device_id device, cl_ulong* device_timestamp, cl_ulong* host_timestamp) CL_API_SUFFIX__VERSION_2_1; extern CL_API_ENTRY cl_int CL_API_CALL clGetHostTimer(cl_device_id device, cl_ulong * host_timestamp) CL_API_SUFFIX__VERSION_2_1; #endif /* Context APIs */ extern CL_API_ENTRY cl_context CL_API_CALL clCreateContext(const cl_context_properties * properties, cl_uint num_devices, const cl_device_id * devices, void (CL_CALLBACK * pfn_notify)(const char * errinfo, const void * private_info, size_t cb, void * user_data), void * user_data, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_context CL_API_CALL clCreateContextFromType(const cl_context_properties * properties, cl_device_type device_type, void (CL_CALLBACK * pfn_notify)(const char * errinfo, const void * private_info, size_t cb, void * user_data), void * user_data, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL 
clGetContextInfo(cl_context context, cl_context_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; /* Command Queue APIs */ #ifdef CL_VERSION_2_0 extern CL_API_ENTRY cl_command_queue CL_API_CALL clCreateCommandQueueWithProperties(cl_context context, cl_device_id device, const cl_queue_properties * properties, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; #endif extern CL_API_ENTRY cl_int CL_API_CALL clRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetCommandQueueInfo(cl_command_queue command_queue, cl_command_queue_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; /* Memory Object APIs */ extern CL_API_ENTRY cl_mem CL_API_CALL clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void * host_ptr, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_1 extern CL_API_ENTRY cl_mem CL_API_CALL clCreateSubBuffer(cl_mem buffer, cl_mem_flags flags, cl_buffer_create_type buffer_create_type, const void * buffer_create_info, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1; #endif #ifdef CL_VERSION_1_2 extern CL_API_ENTRY cl_mem CL_API_CALL clCreateImage(cl_context context, cl_mem_flags flags, const cl_image_format * image_format, const cl_image_desc * image_desc, void * host_ptr, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; #endif #ifdef CL_VERSION_2_0 extern CL_API_ENTRY cl_mem CL_API_CALL clCreatePipe(cl_context context, cl_mem_flags flags, cl_uint pipe_packet_size, cl_uint pipe_max_packets, const cl_pipe_properties * properties, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; #endif #ifdef CL_VERSION_3_0 extern CL_API_ENTRY cl_mem CL_API_CALL clCreateBufferWithProperties(cl_context context, const 
cl_mem_properties * properties, cl_mem_flags flags, size_t size, void * host_ptr, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_3_0; extern CL_API_ENTRY cl_mem CL_API_CALL clCreateImageWithProperties(cl_context context, const cl_mem_properties * properties, cl_mem_flags flags, const cl_image_format * image_format, const cl_image_desc * image_desc, void * host_ptr, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_3_0; #endif extern CL_API_ENTRY cl_int CL_API_CALL clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetSupportedImageFormats(cl_context context, cl_mem_flags flags, cl_mem_object_type image_type, cl_uint num_entries, cl_image_format * image_formats, cl_uint * num_image_formats) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetMemObjectInfo(cl_mem memobj, cl_mem_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetImageInfo(cl_mem image, cl_image_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_2_0 extern CL_API_ENTRY cl_int CL_API_CALL clGetPipeInfo(cl_mem pipe, cl_pipe_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_2_0; #endif #ifdef CL_VERSION_1_1 extern CL_API_ENTRY cl_int CL_API_CALL clSetMemObjectDestructorCallback(cl_mem memobj, void (CL_CALLBACK * pfn_notify)(cl_mem memobj, void * user_data), void * user_data) CL_API_SUFFIX__VERSION_1_1; #endif /* SVM Allocation APIs */ #ifdef CL_VERSION_2_0 extern CL_API_ENTRY void * CL_API_CALL clSVMAlloc(cl_context context, cl_svm_mem_flags flags, size_t size, cl_uint alignment) CL_API_SUFFIX__VERSION_2_0; extern CL_API_ENTRY void CL_API_CALL clSVMFree(cl_context context, void * 
svm_pointer) CL_API_SUFFIX__VERSION_2_0; #endif /* Sampler APIs */ #ifdef CL_VERSION_2_0 extern CL_API_ENTRY cl_sampler CL_API_CALL clCreateSamplerWithProperties(cl_context context, const cl_sampler_properties * sampler_properties, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; #endif extern CL_API_ENTRY cl_int CL_API_CALL clRetainSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetSamplerInfo(cl_sampler sampler, cl_sampler_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; /* Program Object APIs */ extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithSource(cl_context context, cl_uint count, const char ** strings, const size_t * lengths, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithBinary(cl_context context, cl_uint num_devices, const cl_device_id * device_list, const size_t * lengths, const unsigned char ** binaries, cl_int * binary_status, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithBuiltInKernels(cl_context context, cl_uint num_devices, const cl_device_id * device_list, const char * kernel_names, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; #endif #ifdef CL_VERSION_2_1 extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithIL(cl_context context, const void* il, size_t length, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1; #endif extern CL_API_ENTRY cl_int CL_API_CALL clRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clBuildProgram(cl_program program, cl_uint num_devices, const cl_device_id * device_list, const 
char * options, void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data), void * user_data) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 extern CL_API_ENTRY cl_int CL_API_CALL clCompileProgram(cl_program program, cl_uint num_devices, const cl_device_id * device_list, const char * options, cl_uint num_input_headers, const cl_program * input_headers, const char ** header_include_names, void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data), void * user_data) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_program CL_API_CALL clLinkProgram(cl_context context, cl_uint num_devices, const cl_device_id * device_list, const char * options, cl_uint num_input_programs, const cl_program * input_programs, void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data), void * user_data, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; #endif #ifdef CL_VERSION_2_2 extern CL_API_ENTRY cl_int CL_API_CALL clSetProgramReleaseCallback(cl_program program, void (CL_CALLBACK * pfn_notify)(cl_program program, void * user_data), void * user_data) CL_API_SUFFIX__VERSION_2_2; extern CL_API_ENTRY cl_int CL_API_CALL clSetProgramSpecializationConstant(cl_program program, cl_uint spec_id, size_t spec_size, const void* spec_value) CL_API_SUFFIX__VERSION_2_2; #endif #ifdef CL_VERSION_1_2 extern CL_API_ENTRY cl_int CL_API_CALL clUnloadPlatformCompiler(cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2; #endif extern CL_API_ENTRY cl_int CL_API_CALL clGetProgramInfo(cl_program program, cl_program_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetProgramBuildInfo(cl_program program, cl_device_id device, cl_program_build_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; /* Kernel Object APIs */ extern CL_API_ENTRY cl_kernel CL_API_CALL clCreateKernel(cl_program program, 
const char * kernel_name, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clCreateKernelsInProgram(cl_program program, cl_uint num_kernels, cl_kernel * kernels, cl_uint * num_kernels_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_2_1 extern CL_API_ENTRY cl_kernel CL_API_CALL clCloneKernel(cl_kernel source_kernel, cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1; #endif extern CL_API_ENTRY cl_int CL_API_CALL clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void * arg_value) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_2_0 extern CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgSVMPointer(cl_kernel kernel, cl_uint arg_index, const void * arg_value) CL_API_SUFFIX__VERSION_2_0; extern CL_API_ENTRY cl_int CL_API_CALL clSetKernelExecInfo(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void * param_value) CL_API_SUFFIX__VERSION_2_0; #endif extern CL_API_ENTRY cl_int CL_API_CALL clGetKernelInfo(cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 extern CL_API_ENTRY cl_int CL_API_CALL clGetKernelArgInfo(cl_kernel kernel, cl_uint arg_indx, cl_kernel_arg_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; #endif extern CL_API_ENTRY cl_int CL_API_CALL clGetKernelWorkGroupInfo(cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_2_1 extern CL_API_ENTRY cl_int CL_API_CALL clGetKernelSubGroupInfo(cl_kernel kernel, cl_device_id device, 
cl_kernel_sub_group_info param_name, size_t input_value_size, const void* input_value, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_2_1; #endif /* Event Object APIs */ extern CL_API_ENTRY cl_int CL_API_CALL clWaitForEvents(cl_uint num_events, const cl_event * event_list) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetEventInfo(cl_event event, cl_event_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_1 extern CL_API_ENTRY cl_event CL_API_CALL clCreateUserEvent(cl_context context, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1; #endif extern CL_API_ENTRY cl_int CL_API_CALL clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_1 extern CL_API_ENTRY cl_int CL_API_CALL clSetUserEventStatus(cl_event event, cl_int execution_status) CL_API_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_int CL_API_CALL clSetEventCallback(cl_event event, cl_int command_exec_callback_type, void (CL_CALLBACK * pfn_notify)(cl_event event, cl_int event_command_status, void * user_data), void * user_data) CL_API_SUFFIX__VERSION_1_1; #endif /* Profiling APIs */ extern CL_API_ENTRY cl_int CL_API_CALL clGetEventProfilingInfo(cl_event event, cl_profiling_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; /* Flush and Finish APIs */ extern CL_API_ENTRY cl_int CL_API_CALL clFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; /* Enqueued Commands APIs */ extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, size_t offset, size_t size, void * ptr, cl_uint 
num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_1 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadBufferRect(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, const size_t * buffer_offset, const size_t * host_offset, const size_t * region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void * ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_1; #endif extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t offset, size_t size, const void * ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_1 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteBufferRect(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, const size_t * buffer_offset, const size_t * host_offset, const size_t * region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void * ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_1; #endif #ifdef CL_VERSION_1_2 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueFillBuffer(cl_command_queue command_queue, cl_mem buffer, const void * pattern, size_t pattern_size, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_2; #endif extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueCopyBuffer(cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_1 extern CL_API_ENTRY cl_int 
CL_API_CALL clEnqueueCopyBufferRect(cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, const size_t * src_origin, const size_t * dst_origin, const size_t * region, size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, size_t dst_slice_pitch, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_1; #endif extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReadImage(cl_command_queue command_queue, cl_mem image, cl_bool blocking_read, const size_t * origin, const size_t * region, size_t row_pitch, size_t slice_pitch, void * ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueWriteImage(cl_command_queue command_queue, cl_mem image, cl_bool blocking_write, const size_t * origin, const size_t * region, size_t input_row_pitch, size_t input_slice_pitch, const void * ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueFillImage(cl_command_queue command_queue, cl_mem image, const void * fill_color, const size_t * origin, const size_t * region, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_2; #endif extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueCopyImage(cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image, const size_t * src_origin, const size_t * dst_origin, const size_t * region, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueCopyImageToBuffer(cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer, const size_t * src_origin, const size_t * region, size_t dst_offset, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event 
* event) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueCopyBufferToImage(cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image, size_t src_offset, const size_t * dst_origin, const size_t * region, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY void * CL_API_CALL clEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, cl_map_flags map_flags, size_t offset, size_t size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY void * CL_API_CALL clEnqueueMapImage(cl_command_queue command_queue, cl_mem image, cl_bool blocking_map, cl_map_flags map_flags, const size_t * origin, const size_t * region, size_t * image_row_pitch, size_t * image_slice_pitch, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueUnmapMemObject(cl_command_queue command_queue, cl_mem memobj, void * mapped_ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueMigrateMemObjects(cl_command_queue command_queue, cl_uint num_mem_objects, const cl_mem * mem_objects, cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_2; #endif extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t * global_work_offset, const size_t * global_work_size, const size_t * local_work_size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL 
clEnqueueNativeKernel(cl_command_queue command_queue, void (CL_CALLBACK * user_func)(void *), void * args, size_t cb_args, cl_uint num_mem_objects, const cl_mem * mem_list, const void ** args_mem_loc, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueMarkerWithWaitList(cl_command_queue command_queue, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueBarrierWithWaitList(cl_command_queue command_queue, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_2; #endif #ifdef CL_VERSION_2_0 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueSVMFree(cl_command_queue command_queue, cl_uint num_svm_pointers, void * svm_pointers[], void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue, cl_uint num_svm_pointers, void * svm_pointers[], void * user_data), void * user_data, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_2_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueSVMMemcpy(cl_command_queue command_queue, cl_bool blocking_copy, void * dst_ptr, const void * src_ptr, size_t size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_2_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueSVMMemFill(cl_command_queue command_queue, void * svm_ptr, const void * pattern, size_t pattern_size, size_t size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_2_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void * svm_ptr, size_t size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) 
CL_API_SUFFIX__VERSION_2_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueSVMUnmap(cl_command_queue command_queue, void * svm_ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_2_0; #endif #ifdef CL_VERSION_2_1 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueSVMMigrateMem(cl_command_queue command_queue, cl_uint num_svm_pointers, const void ** svm_pointers, const size_t * sizes, cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_2_1; #endif #ifdef CL_VERSION_1_2 /* Extension function access * * Returns the extension function address for the given function name, * or NULL if a valid function can not be found. The client must * check to make sure the address is not NULL, before using or * calling the returned function address. */ extern CL_API_ENTRY void * CL_API_CALL clGetExtensionFunctionAddressForPlatform(cl_platform_id platform, const char * func_name) CL_API_SUFFIX__VERSION_1_2; #endif #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS /* * WARNING: * This API introduces mutable state into the OpenCL implementation. It has been REMOVED * to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the * OpenCL 1.1 conformance test, and consequently may not work or may not work dependably. * It is likely to be non-performant. Use of this API is not advised. Use at your own risk. * * Software developers previously relying on this API are instructed to set the command queue * properties when creating the queue, instead. 
 */
/* Only declared when the application opts in by defining
 * CL_USE_DEPRECATED_OPENCL_1_0_APIS before including this header. */
extern CL_API_ENTRY cl_int CL_API_CALL
clSetCommandQueueProperty(cl_command_queue command_queue,
                          cl_command_queue_properties properties,
                          cl_bool enable,
                          cl_command_queue_properties * old_properties) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */

/* Deprecated OpenCL 1.1 APIs
 *
 * Unlike the 1.0 entry point above, these are declared unconditionally and
 * are only tagged with the *_VERSION_1_1_DEPRECATED prefix/suffix macros.
 * NOTE(review): presumably superseded by clCreateImage,
 * clEnqueueMarkerWithWaitList, clEnqueueBarrierWithWaitList,
 * clUnloadPlatformCompiler and clGetExtensionFunctionAddressForPlatform,
 * which this header declares under CL_VERSION_1_2; confirm against the
 * OpenCL specification.
 */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateImage2D(cl_context context,
                cl_mem_flags flags,
                const cl_image_format * image_format,
                size_t image_width,
                size_t image_height,
                size_t image_row_pitch,
                void * host_ptr,
                cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;

extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateImage3D(cl_context context,
                cl_mem_flags flags,
                const cl_image_format * image_format,
                size_t image_width,
                size_t image_height,
                size_t image_depth,
                size_t image_row_pitch,
                size_t image_slice_pitch,
                void * host_ptr,
                cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;

extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clEnqueueMarker(cl_command_queue command_queue,
                cl_event * event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;

extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clEnqueueWaitForEvents(cl_command_queue command_queue,
                       cl_uint num_events,
                       const cl_event * event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;

extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clEnqueueBarrier(cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;

extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;

extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
clGetExtensionFunctionAddress(const char * func_name) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;

/* Deprecated OpenCL 1.2 APIs
 *
 * The declarations in this group carry the *_VERSION_1_2_DEPRECATED
 * markers, i.e. they are 1.2-era entry points, named here the same way as
 * the 1.1 group above.
 * NOTE(review): presumably deprecated as of OpenCL 2.0 in favour of the
 * *WithProperties creation calls declared under CL_VERSION_2_0 in this
 * header; confirm against the OpenCL specification.
 */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL
clCreateCommandQueue(cl_context context, cl_device_id device, cl_command_queue_properties properties, cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL clCreateSampler(cl_context context, cl_bool normalized_coords, cl_addressing_mode addressing_mode, cl_filter_mode filter_mode, cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL clEnqueueTask(cl_command_queue command_queue, cl_kernel kernel, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; #ifdef __cplusplus } #endif #endif /* __OPENCL_CL_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_d3d10.h ================================================ /******************************************************************************* * Copyright (c) 2008-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ #ifndef __OPENCL_CL_D3D10_H #define __OPENCL_CL_D3D10_H #include #include #include #ifdef __cplusplus extern "C" { #endif /****************************************************************************** * cl_khr_d3d10_sharing */ #define cl_khr_d3d10_sharing 1 typedef cl_uint cl_d3d10_device_source_khr; typedef cl_uint cl_d3d10_device_set_khr; /******************************************************************************/ /* Error Codes */ #define CL_INVALID_D3D10_DEVICE_KHR -1002 #define CL_INVALID_D3D10_RESOURCE_KHR -1003 #define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 #define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 /* cl_d3d10_device_source_nv */ #define CL_D3D10_DEVICE_KHR 0x4010 #define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 /* cl_d3d10_device_set_nv */ #define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 #define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 /* cl_context_info */ #define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 #define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C /* cl_mem_info */ #define CL_MEM_D3D10_RESOURCE_KHR 0x4015 /* cl_image_info */ #define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 /* cl_command_type */ #define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 #define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 /******************************************************************************/ typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source, void * d3d_object, cl_d3d10_device_set_khr d3d_device_set, cl_uint num_entries, cl_device_id * devices, cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( cl_context context, cl_mem_flags flags, ID3D10Buffer * resource, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( cl_context context, cl_mem_flags 
flags, ID3D10Texture2D * resource, UINT subresource, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( cl_context context, cl_mem_flags flags, ID3D10Texture3D * resource, UINT subresource, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem * mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem * mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; #ifdef __cplusplus } #endif #endif /* __OPENCL_CL_D3D10_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_d3d11.h ================================================ /******************************************************************************* * Copyright (c) 2008-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ #ifndef __OPENCL_CL_D3D11_H #define __OPENCL_CL_D3D11_H #include #include #include #ifdef __cplusplus extern "C" { #endif /****************************************************************************** * cl_khr_d3d11_sharing */ #define cl_khr_d3d11_sharing 1 typedef cl_uint cl_d3d11_device_source_khr; typedef cl_uint cl_d3d11_device_set_khr; /******************************************************************************/ /* Error Codes */ #define CL_INVALID_D3D11_DEVICE_KHR -1006 #define CL_INVALID_D3D11_RESOURCE_KHR -1007 #define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008 #define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009 /* cl_d3d11_device_source_khr */ #define CL_D3D11_DEVICE_KHR 0x4019 #define CL_D3D11_DXGI_ADAPTER_KHR 0x401A /* cl_d3d11_device_set_khr */ #define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B #define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C /* cl_context_info */ #define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D #define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D /* cl_mem_info */ #define CL_MEM_D3D11_RESOURCE_KHR 0x401E /* cl_image_info */ #define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F /* cl_command_type */ #define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020 #define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021 /******************************************************************************/ typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)( cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source, void * d3d_object, cl_d3d11_device_set_khr d3d_device_set, cl_uint num_entries, cl_device_id * devices, cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)( cl_context context, cl_mem_flags flags, ID3D11Buffer * resource, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)( cl_context context, cl_mem_flags flags,
ID3D11Texture2D * resource, UINT subresource, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)( cl_context context, cl_mem_flags flags, ID3D11Texture3D * resource, UINT subresource, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem * mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem * mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_2; #ifdef __cplusplus } #endif #endif /* __OPENCL_CL_D3D11_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_dx9_media_sharing.h ================================================ /******************************************************************************* * Copyright (c) 2008-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ #ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H #define __OPENCL_CL_DX9_MEDIA_SHARING_H #include #include #ifdef __cplusplus extern "C" { #endif /******************************************************************************/ /* cl_khr_dx9_media_sharing */ #define cl_khr_dx9_media_sharing 1 typedef cl_uint cl_dx9_media_adapter_type_khr; typedef cl_uint cl_dx9_media_adapter_set_khr; #if defined(_WIN32) #include typedef struct _cl_dx9_surface_info_khr { IDirect3DSurface9 *resource; HANDLE shared_handle; } cl_dx9_surface_info_khr; #endif /******************************************************************************/ /* Error Codes */ #define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010 #define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011 #define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012 #define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013 /* cl_dx9_media_adapter_type_khr */ #define CL_ADAPTER_D3D9_KHR 0x2020 #define CL_ADAPTER_D3D9EX_KHR 0x2021 #define CL_ADAPTER_DXVA_KHR 0x2022 /* cl_dx9_media_adapter_set_khr */ #define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023 #define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024 /* cl_context_info */ #define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025 #define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026 #define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027 /* cl_mem_info */ #define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028 #define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029 /* cl_image_info */ #define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A /* cl_command_type */ #define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B #define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C /******************************************************************************/ typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)( cl_platform_id platform, cl_uint num_media_adapters, cl_dx9_media_adapter_type_khr * media_adapter_type, void * media_adapters, cl_dx9_media_adapter_set_khr
media_adapter_set, cl_uint num_entries, cl_device_id * devices, cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)( cl_context context, cl_mem_flags flags, cl_dx9_media_adapter_type_khr adapter_type, void * surface_info, cl_uint plane, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem * mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem * mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_2; #ifdef __cplusplus } #endif #endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_dx9_media_sharing_intel.h ================================================ /******************************************************************************* * Copyright (c) 2008-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ /*****************************************************************************\ Copyright (c) 2013-2019 Intel Corporation All Rights Reserved. THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. File Name: cl_dx9_media_sharing_intel.h Abstract: Notes: \*****************************************************************************/ #ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H #define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /*************************************** * cl_intel_dx9_media_sharing extension * ****************************************/ #define cl_intel_dx9_media_sharing 1 typedef cl_uint cl_dx9_device_source_intel; typedef cl_uint cl_dx9_device_set_intel; /* error codes */ #define CL_INVALID_DX9_DEVICE_INTEL -1010 #define CL_INVALID_DX9_RESOURCE_INTEL -1011 #define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012 #define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013 /* cl_dx9_device_source_intel */ #define CL_D3D9_DEVICE_INTEL 0x4022 #define CL_D3D9EX_DEVICE_INTEL 0x4070 #define CL_DXVA_DEVICE_INTEL 0x4071 /* cl_dx9_device_set_intel */ #define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024 #define 
CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025 /* cl_context_info */ #define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026 #define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072 #define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073 /* cl_mem_info */ #define CL_MEM_DX9_RESOURCE_INTEL 0x4027 #define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074 /* cl_image_info */ #define CL_IMAGE_DX9_PLANE_INTEL 0x4075 /* cl_command_type */ #define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A #define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B /******************************************************************************/ extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromDX9INTEL( cl_platform_id platform, cl_dx9_device_source_intel dx9_device_source, void* dx9_object, cl_dx9_device_set_intel dx9_device_set, cl_uint num_entries, cl_device_id* devices, cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1; typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)( cl_platform_id platform, cl_dx9_device_source_intel dx9_device_source, void* dx9_object, cl_dx9_device_set_intel dx9_device_set, cl_uint num_entries, cl_device_id* devices, cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromDX9MediaSurfaceINTEL( cl_context context, cl_mem_flags flags, IDirect3DSurface9* resource, HANDLE sharedHandle, UINT plane, cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1; typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)( cl_context context, cl_mem_flags flags, IDirect3DSurface9* resource, HANDLE sharedHandle, UINT plane, cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireDX9ObjectsINTEL( cl_command_queue command_queue, cl_uint num_objects, const cl_mem* mem_objects, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_EXT_SUFFIX__VERSION_1_1; typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)( cl_command_queue command_queue, cl_uint 
num_objects, const cl_mem* mem_objects, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_EXT_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9ObjectsINTEL( cl_command_queue command_queue, cl_uint num_objects, cl_mem* mem_objects, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_EXT_SUFFIX__VERSION_1_1; typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)( cl_command_queue command_queue, cl_uint num_objects, cl_mem* mem_objects, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_EXT_SUFFIX__VERSION_1_1; #ifdef __cplusplus } #endif #endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_egl.h ================================================ /******************************************************************************* * Copyright (c) 2008-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ #ifndef __OPENCL_CL_EGL_H #define __OPENCL_CL_EGL_H #include #ifdef __cplusplus extern "C" { #endif /* Command type for events created with clEnqueueAcquireEGLObjectsKHR */ #define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F #define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D #define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E /* Error type for clCreateFromEGLImageKHR */ #define CL_INVALID_EGL_OBJECT_KHR -1093 #define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092 /* CLeglImageKHR is an opaque handle to an EGLImage */ typedef void* CLeglImageKHR; /* CLeglDisplayKHR is an opaque handle to an EGLDisplay */ typedef void* CLeglDisplayKHR; /* CLeglSyncKHR is an opaque handle to an EGLSync object */ typedef void* CLeglSyncKHR; /* properties passed to clCreateFromEGLImageKHR */ typedef intptr_t cl_egl_image_properties_khr; #define cl_khr_egl_image 1 extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromEGLImageKHR(cl_context context, CLeglDisplayKHR egldisplay, CLeglImageKHR eglimage, cl_mem_flags flags, const cl_egl_image_properties_khr * properties, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)( cl_context context, CLeglDisplayKHR egldisplay, CLeglImageKHR eglimage, cl_mem_flags flags, const cl_egl_image_properties_khr * properties, cl_int * errcode_ret); extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue, cl_uint num_objects, const cl_mem * mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem * mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event); extern CL_API_ENTRY cl_int CL_API_CALL 
clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue, cl_uint num_objects, const cl_mem * mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem * mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event); #define cl_khr_egl_event 1 extern CL_API_ENTRY cl_event CL_API_CALL clCreateEventFromEGLSyncKHR(cl_context context, CLeglSyncKHR sync, CLeglDisplayKHR display, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)( cl_context context, CLeglSyncKHR sync, CLeglDisplayKHR display, cl_int * errcode_ret); #ifdef __cplusplus } #endif #endif /* __OPENCL_CL_EGL_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_ext.h ================================================ /******************************************************************************* * Copyright (c) 2008-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ /* cl_ext.h contains OpenCL extensions which don't have external */ /* (OpenGL, D3D) dependencies. 
*/ #ifndef __CL_EXT_H #define __CL_EXT_H #ifdef __cplusplus extern "C" { #endif #include /* cl_khr_fp64 extension - no extension #define since it has no functions */ /* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */ #if CL_TARGET_OPENCL_VERSION <= 110 #define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 #endif /* cl_khr_fp16 extension - no extension #define since it has no functions */ #define CL_DEVICE_HALF_FP_CONFIG 0x1033 /* Memory object destruction * * Apple extension used to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR * * Registers a user callback function that will be called when the memory object is deleted and its resources * freed. Each call to clSetMemObjectDestructorAPPLE registers the specified user callback function on a callback * stack associated with memobj. The registered user callback functions are called in the reverse order in * which they were registered. The user callback functions are called and then the memory object is deleted * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be * notified when the memory referenced by host_ptr, specified when the memory object is created and used as * the storage bits for the memory object, can be reused or freed. * * The application may not call CL APIs with the cl_mem object passed to the pfn_notify. * * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) * before using. */ #define cl_APPLE_SetMemObjectDestructor 1 cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem memobj, void (* pfn_notify)(cl_mem memobj, void * user_data), void * user_data) CL_EXT_SUFFIX__VERSION_1_0; /* Context Logging Functions * * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext(). * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) * before using.
* * clLogMessagesToSystemLogAPPLE forwards all log messages to the Apple System Logger */ #define cl_APPLE_ContextLoggingFunctions 1 extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * errstr, const void * private_info, size_t cb, void * user_data) CL_EXT_SUFFIX__VERSION_1_0; /* clLogMessagesToStdoutAPPLE sends all log messages to the file descriptor stdout */ extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * errstr, const void * private_info, size_t cb, void * user_data) CL_EXT_SUFFIX__VERSION_1_0; /* clLogMessagesToStderrAPPLE sends all log messages to the file descriptor stderr */ extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * errstr, const void * private_info, size_t cb, void * user_data) CL_EXT_SUFFIX__VERSION_1_0; /************************ * cl_khr_icd extension * ************************/ #define cl_khr_icd 1 /* cl_platform_info */ #define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 /* Additional Error Codes */ #define CL_PLATFORM_NOT_FOUND_KHR -1001 extern CL_API_ENTRY cl_int CL_API_CALL clIcdGetPlatformIDsKHR(cl_uint num_entries, cl_platform_id * platforms, cl_uint * num_platforms); typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint num_entries, cl_platform_id * platforms, cl_uint * num_platforms); /******************************* * cl_khr_il_program extension * *******************************/ #define cl_khr_il_program 1 /* New property to clGetDeviceInfo for retrieving supported intermediate * languages */ #define CL_DEVICE_IL_VERSION_KHR 0x105B /* New property to clGetProgramInfo for retrieving the IL of a * program */ #define CL_PROGRAM_IL_KHR 0x1169 extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithILKHR(cl_context context, const void * il, size_t length, cl_int * errcode_ret); typedef CL_API_ENTRY cl_program (CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context context, const void * il, size_t length, cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; /* Extension:
cl_khr_image2d_from_buffer * * This extension allows a 2D image to be created from a cl_mem buffer without * a copy. The type associated with a 2D image created from a buffer in an * OpenCL program is image2d_t. Both the sampler and sampler-less read_image * built-in functions are supported for 2D images and 2D images created from * a buffer. Similarly, the write_image built-ins are also supported for 2D * images created from a buffer. * * When the 2D image from buffer is created, the client must specify the * width, height, image format (i.e. channel order and channel data type) * and optionally the row pitch. * * The pitch specified must be a multiple of * CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels. * The base address of the buffer must be aligned to * CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels. */ #define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR 0x104A #define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR 0x104B /************************************** * cl_khr_initialize_memory extension * **************************************/ #define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030 /************************************** * cl_khr_terminate_context extension * **************************************/ #define CL_CONTEXT_TERMINATED_KHR -1121 #define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031 #define CL_CONTEXT_TERMINATE_KHR 0x2032 #define cl_khr_terminate_context 1 extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context context) CL_EXT_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_EXT_SUFFIX__VERSION_1_2; /* * Extension: cl_khr_spir * * This extension adds support to create an OpenCL program object from a * Standard Portable Intermediate Representation (SPIR) instance */ #define CL_DEVICE_SPIR_VERSIONS 0x40E0 #define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1 /***************************************** * cl_khr_create_command_queue extension * *****************************************/ #define 
cl_khr_create_command_queue 1 typedef cl_bitfield cl_queue_properties_khr; extern CL_API_ENTRY cl_command_queue CL_API_CALL clCreateCommandQueueWithPropertiesKHR(cl_context context, cl_device_id device, const cl_queue_properties_khr* properties, cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_command_queue (CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context, cl_device_id device, const cl_queue_properties_khr* properties, cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; /****************************************** * cl_nv_device_attribute_query extension * ******************************************/ /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 #define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 #define CL_DEVICE_WARP_SIZE_NV 0x4003 #define CL_DEVICE_GPU_OVERLAP_NV 0x4004 #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 #define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 /********************************* * cl_amd_device_attribute_query * *********************************/ #define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 #define CL_DEVICE_TOPOLOGY_AMD 0x4037 #define CL_DEVICE_BOARD_NAME_AMD 0x4038 #define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039 #define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040 #define CL_DEVICE_SIMD_WIDTH_AMD 0x4041 #define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042 #define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043 #define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044 #define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045 #define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046 #define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047 #define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048 #define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD 0x4049 #define CL_DEVICE_GFXIP_MAJOR_AMD 0x404A #define CL_DEVICE_GFXIP_MINOR_AMD 0x404B #define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD 0x404C 
#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030 #define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031 #define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD 0x4033 #define CL_DEVICE_PCIE_ID_AMD 0x4034 /********************************* * cl_arm_printf extension *********************************/ #define CL_PRINTF_CALLBACK_ARM 0x40B0 #define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 /*********************************** * cl_ext_device_fission extension ***********************************/ #define cl_ext_device_fission 1 extern CL_API_ENTRY cl_int CL_API_CALL clReleaseDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; extern CL_API_ENTRY cl_int CL_API_CALL clRetainDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; typedef cl_ulong cl_device_partition_property_ext; extern CL_API_ENTRY cl_int CL_API_CALL clCreateSubDevicesEXT(cl_device_id in_device, const cl_device_partition_property_ext * properties, cl_uint num_entries, cl_device_id * out_devices, cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1; typedef CL_API_ENTRY cl_int (CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id in_device, const cl_device_partition_property_ext * properties, cl_uint num_entries, cl_device_id * out_devices, cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1; /* cl_device_partition_property_ext */ #define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 #define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 #define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 /* clDeviceGetInfo selectors */ #define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 #define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 #define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 #define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 #define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 /* error 
codes */ #define CL_DEVICE_PARTITION_FAILED_EXT -1057 #define CL_INVALID_PARTITION_COUNT_EXT -1058 #define CL_INVALID_PARTITION_NAME_EXT -1059 /* CL_AFFINITY_DOMAINs */ #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 #define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 /* cl_device_partition_property_ext list terminators */ #define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0) #define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0) #define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1) /*********************************** * cl_ext_migrate_memobject extension definitions ***********************************/ #define cl_ext_migrate_memobject 1 typedef cl_bitfield cl_mem_migration_flags_ext; #define CL_MIGRATE_MEM_OBJECT_HOST_EXT 0x1 #define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueMigrateMemObjectEXT(cl_command_queue command_queue, cl_uint num_mem_objects, const cl_mem * mem_objects, cl_mem_migration_flags_ext flags, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event); typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue, cl_uint num_mem_objects, const cl_mem * mem_objects, cl_mem_migration_flags_ext flags, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event); /********************************* * cl_qcom_ext_host_ptr extension *********************************/ #define cl_qcom_ext_host_ptr 1 #define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) #define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 #define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 #define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2 #define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 #define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 #define 
CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 #define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 #define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 typedef cl_uint cl_image_pitch_info_qcom; extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceImageInfoQCOM(cl_device_id device, size_t image_width, size_t image_height, const cl_image_format *image_format, cl_image_pitch_info_qcom param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret); typedef struct _cl_mem_ext_host_ptr { /* Type of external memory allocation. */ /* Legal values will be defined in layered extensions. */ cl_uint allocation_type; /* Host cache policy for this external memory allocation. */ cl_uint host_cache_policy; } cl_mem_ext_host_ptr; /******************************************* * cl_qcom_ext_host_ptr_iocoherent extension ********************************************/ /* Cache policy specifying io-coherence */ #define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9 /********************************* * cl_qcom_ion_host_ptr extension *********************************/ #define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 typedef struct _cl_mem_ion_host_ptr { /* Type of external memory allocation. */ /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */ cl_mem_ext_host_ptr ext_host_ptr; /* ION file descriptor */ int ion_filedesc; /* Host pointer to the ION allocated memory */ void* ion_hostptr; } cl_mem_ion_host_ptr; /********************************* * cl_qcom_android_native_buffer_host_ptr extension *********************************/ #define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6 typedef struct _cl_mem_android_native_buffer_host_ptr { /* Type of external memory allocation. */ /* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. 
*/ cl_mem_ext_host_ptr ext_host_ptr; /* Virtual pointer to the android native buffer */ void* anb_ptr; } cl_mem_android_native_buffer_host_ptr; /****************************************** * cl_img_yuv_image extension * ******************************************/ /* Image formats used in clCreateImage */ #define CL_NV21_IMG 0x40D0 #define CL_YV12_IMG 0x40D1 /****************************************** * cl_img_cached_allocations extension * ******************************************/ /* Flag values used by clCreateBuffer */ #define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26) #define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27) /****************************************** * cl_img_use_gralloc_ptr extension * ******************************************/ #define cl_img_use_gralloc_ptr 1 /* Flag values used by clCreateBuffer */ #define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28) /* To be used by clGetEventInfo: */ #define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2 #define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3 /* Error code from clEnqueueReleaseGrallocObjectsIMG */ #define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4 extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireGrallocObjectsIMG(cl_command_queue command_queue, cl_uint num_objects, const cl_mem * mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseGrallocObjectsIMG(cl_command_queue command_queue, cl_uint num_objects, const cl_mem * mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; /********************************* * cl_khr_subgroups extension *********************************/ #define cl_khr_subgroups 1 #if !defined(CL_VERSION_2_1) /* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h. 
In hindsight, there should have been a khr suffix on this type for the extension, but keeping it un-suffixed to maintain backwards compatibility. */ typedef cl_uint cl_kernel_sub_group_info; #endif /* cl_kernel_sub_group_info */ #define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033 #define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034 extern CL_API_ENTRY cl_int CL_API_CALL clGetKernelSubGroupInfoKHR(cl_kernel in_kernel, cl_device_id in_device, cl_kernel_sub_group_info param_name, size_t input_value_size, const void * input_value, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel in_kernel, cl_device_id in_device, cl_kernel_sub_group_info param_name, size_t input_value_size, const void * input_value, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; /********************************* * cl_khr_mipmap_image extension *********************************/ /* cl_sampler_properties */ #define CL_SAMPLER_MIP_FILTER_MODE_KHR 0x1155 #define CL_SAMPLER_LOD_MIN_KHR 0x1156 #define CL_SAMPLER_LOD_MAX_KHR 0x1157 /********************************* * cl_khr_priority_hints extension *********************************/ /* This extension define is for backwards compatibility. It shouldn't be required since this extension has no new functions. */ #define cl_khr_priority_hints 1 typedef cl_uint cl_queue_priority_khr; /* cl_command_queue_properties */ #define CL_QUEUE_PRIORITY_KHR 0x1096 /* cl_queue_priority_khr */ #define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0) #define CL_QUEUE_PRIORITY_MED_KHR (1<<1) #define CL_QUEUE_PRIORITY_LOW_KHR (1<<2) /********************************* * cl_khr_throttle_hints extension *********************************/ /* This extension define is for backwards compatibility. It shouldn't be required since this extension has no new functions. 
*/ #define cl_khr_throttle_hints 1 typedef cl_uint cl_queue_throttle_khr; /* cl_command_queue_properties */ #define CL_QUEUE_THROTTLE_KHR 0x1097 /* cl_queue_throttle_khr */ #define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0) #define CL_QUEUE_THROTTLE_MED_KHR (1<<1) #define CL_QUEUE_THROTTLE_LOW_KHR (1<<2) /********************************* * cl_khr_subgroup_named_barrier *********************************/ /* This extension define is for backwards compatibility. It shouldn't be required since this extension has no new functions. */ #define cl_khr_subgroup_named_barrier 1 /* cl_device_info */ #define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035 /********************************* * cl_khr_extended_versioning *********************************/ #define cl_khr_extended_versioning 1 #define CL_VERSION_MAJOR_BITS_KHR (10) #define CL_VERSION_MINOR_BITS_KHR (10) #define CL_VERSION_PATCH_BITS_KHR (12) #define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1) #define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1) #define CL_VERSION_PATCH_MASK_KHR ((1 << CL_VERSION_PATCH_BITS_KHR) - 1) #define CL_VERSION_MAJOR_KHR(version) ((version) >> (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) #define CL_VERSION_MINOR_KHR(version) (((version) >> CL_VERSION_PATCH_BITS_KHR) & CL_VERSION_MINOR_MASK_KHR) #define CL_VERSION_PATCH_KHR(version) ((version) & CL_VERSION_PATCH_MASK_KHR) #define CL_MAKE_VERSION_KHR(major, minor, patch) \ ((((major) & CL_VERSION_MAJOR_MASK_KHR) << (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) | \ (((minor) & CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \ ((patch) & CL_VERSION_PATCH_MASK_KHR)) typedef cl_uint cl_version_khr; #define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64 typedef struct _cl_name_version_khr { cl_version_khr version; char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR]; } cl_name_version_khr; /* cl_platform_info */ #define CL_PLATFORM_NUMERIC_VERSION_KHR 0x0906 #define 
CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR 0x0907 /* cl_device_info */ #define CL_DEVICE_NUMERIC_VERSION_KHR 0x105E #define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR 0x105F #define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR 0x1060 #define CL_DEVICE_ILS_WITH_VERSION_KHR 0x1061 #define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR 0x1062 /********************************* * cl_khr_device_uuid extension *********************************/ #define cl_khr_device_uuid 1 #define CL_UUID_SIZE_KHR 16 #define CL_LUID_SIZE_KHR 8 #define CL_DEVICE_UUID_KHR 0x106A #define CL_DRIVER_UUID_KHR 0x106B #define CL_DEVICE_LUID_VALID_KHR 0x106C #define CL_DEVICE_LUID_KHR 0x106D #define CL_DEVICE_NODE_MASK_KHR 0x106E /********************************** * cl_arm_import_memory extension * **********************************/ #define cl_arm_import_memory 1 typedef intptr_t cl_import_properties_arm; /* Default and valid properties name for cl_arm_import_memory */ #define CL_IMPORT_TYPE_ARM 0x40B2 /* Host process memory type default value for CL_IMPORT_TYPE_ARM property */ #define CL_IMPORT_TYPE_HOST_ARM 0x40B3 /* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */ #define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4 /* Protected memory property */ #define CL_IMPORT_TYPE_PROTECTED_ARM 0x40B5 /* Android hardware buffer type value for CL_IMPORT_TYPE_ARM property */ #define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2 /* Data consistency with host property */ #define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3 /* Import memory size value to indicate a size for the whole buffer */ #define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX /* This extension adds a new function that allows for direct memory import into * OpenCL via the clImportMemoryARM function. * * Memory imported through this interface will be mapped into the device's page * tables directly, providing zero copy access. It will never fall back to copy * operations and aliased buffers. 
* * Types of memory supported for import are specified as additional extension * strings. * * This extension produces cl_mem allocations which are compatible with all other * users of cl_mem in the standard API. * * This extension maps pages with the same properties as the normal buffer creation * function clCreateBuffer. */ extern CL_API_ENTRY cl_mem CL_API_CALL clImportMemoryARM( cl_context context, cl_mem_flags flags, const cl_import_properties_arm *properties, void *memory, size_t size, cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0; /****************************************** * cl_arm_shared_virtual_memory extension * ******************************************/ #define cl_arm_shared_virtual_memory 1 /* Used by clGetDeviceInfo */ #define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6 /* Used by clGetMemObjectInfo */ #define CL_MEM_USES_SVM_POINTER_ARM 0x40B7 /* Used by clSetKernelExecInfoARM: */ #define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8 #define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9 /* To be used by clGetEventInfo: */ #define CL_COMMAND_SVM_FREE_ARM 0x40BA #define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB #define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC #define CL_COMMAND_SVM_MAP_ARM 0x40BD #define CL_COMMAND_SVM_UNMAP_ARM 0x40BE /* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. 
*/ #define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0) #define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1) #define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2) #define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3) /* Flag values used by clSVMAllocARM: */ #define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10) #define CL_MEM_SVM_ATOMICS_ARM (1 << 11) typedef cl_bitfield cl_svm_mem_flags_arm; typedef cl_uint cl_kernel_exec_info_arm; typedef cl_bitfield cl_device_svm_capabilities_arm; extern CL_API_ENTRY void * CL_API_CALL clSVMAllocARM(cl_context context, cl_svm_mem_flags_arm flags, size_t size, cl_uint alignment) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY void CL_API_CALL clSVMFreeARM(cl_context context, void * svm_pointer) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueSVMFreeARM(cl_command_queue command_queue, cl_uint num_svm_pointers, void * svm_pointers[], void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue, cl_uint num_svm_pointers, void * svm_pointers[], void * user_data), void * user_data, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueSVMMemcpyARM(cl_command_queue command_queue, cl_bool blocking_copy, void * dst_ptr, const void * src_ptr, size_t size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueSVMMemFillARM(cl_command_queue command_queue, void * svm_ptr, const void * pattern, size_t pattern_size, size_t size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueSVMMapARM(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void * svm_ptr, size_t size, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; extern 
CL_API_ENTRY cl_int CL_API_CALL clEnqueueSVMUnmapARM(cl_command_queue command_queue, void * svm_ptr, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgSVMPointerARM(cl_kernel kernel, cl_uint arg_index, const void * arg_value) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clSetKernelExecInfoARM(cl_kernel kernel, cl_kernel_exec_info_arm param_name, size_t param_value_size, const void * param_value) CL_EXT_SUFFIX__VERSION_1_2; /******************************** * cl_arm_get_core_id extension * ********************************/ #ifdef CL_VERSION_1_2 #define cl_arm_get_core_id 1 /* Device info property for bitfield of cores present */ #define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM 0x40BF #endif /* CL_VERSION_1_2 */ /********************************* * cl_arm_job_slot_selection *********************************/ #define cl_arm_job_slot_selection 1 /* cl_device_info */ #define CL_DEVICE_JOB_SLOTS_ARM 0x41E0 /* cl_command_queue_properties */ #define CL_QUEUE_JOB_SLOT_ARM 0x41E1 #ifdef __cplusplus } #endif #endif /* __CL_EXT_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_ext_intel.h ================================================ /******************************************************************************* * Copyright (c) 2008-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ******************************************************************************/ /*****************************************************************************\ Copyright (c) 2013-2020 Intel Corporation All Rights Reserved. THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. File Name: cl_ext_intel.h Abstract: Notes: \*****************************************************************************/ #ifndef __CL_EXT_INTEL_H #define __CL_EXT_INTEL_H #include <CL/cl.h> #include <CL/cl_platform.h> #ifdef __cplusplus extern "C" { #endif /*************************************** * cl_intel_thread_local_exec extension * ****************************************/ #define cl_intel_thread_local_exec 1 #define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31) /*********************************************** * cl_intel_device_partition_by_names extension * ************************************************/ #define cl_intel_device_partition_by_names 1 #define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052 #define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1 /************************************************ * cl_intel_accelerator extension * * cl_intel_motion_estimation extension * * cl_intel_advanced_motion_estimation extension * *************************************************/ #define cl_intel_accelerator 1 #define 
cl_intel_motion_estimation 1 #define cl_intel_advanced_motion_estimation 1 typedef struct _cl_accelerator_intel* cl_accelerator_intel; typedef cl_uint cl_accelerator_type_intel; typedef cl_uint cl_accelerator_info_intel; typedef struct _cl_motion_estimation_desc_intel { cl_uint mb_block_type; cl_uint subpixel_mode; cl_uint sad_adjust_mode; cl_uint search_path_type; } cl_motion_estimation_desc_intel; /* error codes */ #define CL_INVALID_ACCELERATOR_INTEL -1094 #define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095 #define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096 #define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097 /* cl_accelerator_type_intel */ #define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0 /* cl_accelerator_info_intel */ #define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090 #define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091 #define CL_ACCELERATOR_CONTEXT_INTEL 0x4092 #define CL_ACCELERATOR_TYPE_INTEL 0x4093 /* cl_motion_detect_desc_intel flags */ #define CL_ME_MB_TYPE_16x16_INTEL 0x0 #define CL_ME_MB_TYPE_8x8_INTEL 0x1 #define CL_ME_MB_TYPE_4x4_INTEL 0x2 #define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 #define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 #define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2 #define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 #define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1 #define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0 #define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1 #define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5 #define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0 #define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1 #define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2 #define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4 #define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1 #define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2 #define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3 #define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16 #define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21 #define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32 #define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43 #define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48 #define 
CL_ME_COST_PENALTY_NONE_INTEL 0x0 #define CL_ME_COST_PENALTY_LOW_INTEL 0x1 #define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2 #define CL_ME_COST_PENALTY_HIGH_INTEL 0x3 #define CL_ME_COST_PRECISION_QPEL_INTEL 0x0 #define CL_ME_COST_PRECISION_HPEL_INTEL 0x1 #define CL_ME_COST_PRECISION_PEL_INTEL 0x2 #define CL_ME_COST_PRECISION_DPEL_INTEL 0x3 #define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 #define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 #define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 #define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 #define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 #define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 #define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 #define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 #define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 #define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 #define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 #define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 #define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 #define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 /* cl_device_info */ #define CL_DEVICE_ME_VERSION_INTEL 0x407E #define CL_ME_VERSION_LEGACY_INTEL 0x0 #define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1 #define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2 extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL clCreateAcceleratorINTEL( cl_context context, cl_accelerator_type_intel accelerator_type, size_t descriptor_size, const void* descriptor, cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)( cl_context context, cl_accelerator_type_intel accelerator_type, size_t descriptor_size, const void* descriptor, cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clGetAcceleratorInfoINTEL( cl_accelerator_intel accelerator, cl_accelerator_info_intel param_name, size_t param_value_size, void* param_value, size_t* 
param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)( cl_accelerator_intel accelerator, cl_accelerator_info_intel param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clRetainAcceleratorINTEL( cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)( cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clReleaseAcceleratorINTEL( cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)( cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2; /****************************************** * cl_intel_simultaneous_sharing extension * *******************************************/ #define cl_intel_simultaneous_sharing 1 #define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104 #define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105 /*********************************** * cl_intel_egl_image_yuv extension * ************************************/ #define cl_intel_egl_image_yuv 1 #define CL_EGL_YUV_PLANE_INTEL 0x4107 /******************************** * cl_intel_packed_yuv extension * *********************************/ #define cl_intel_packed_yuv 1 #define CL_YUYV_INTEL 0x4076 #define CL_UYVY_INTEL 0x4077 #define CL_YVYU_INTEL 0x4078 #define CL_VYUY_INTEL 0x4079 /******************************************** * cl_intel_required_subgroup_size extension * *********************************************/ #define cl_intel_required_subgroup_size 1 #define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108 #define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109 #define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A /**************************************** * cl_intel_driver_diagnostics extension * *****************************************/ #define 
cl_intel_driver_diagnostics 1 typedef cl_uint cl_diagnostics_verbose_level; #define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106 #define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff ) #define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 ) #define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 ) #define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 ) /******************************** * cl_intel_planar_yuv extension * *********************************/ #define CL_NV12_INTEL 0x410E #define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 ) #define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 ) #define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E #define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F /******************************************************* * cl_intel_device_side_avc_motion_estimation extension * ********************************************************/ #define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B #define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C #define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D #define CL_AVC_ME_VERSION_0_INTEL 0x0 /* No support. */ #define CL_AVC_ME_VERSION_1_INTEL 0x1 /* First supported version. 
*/ #define CL_AVC_ME_MAJOR_16x16_INTEL 0x0 #define CL_AVC_ME_MAJOR_16x8_INTEL 0x1 #define CL_AVC_ME_MAJOR_8x16_INTEL 0x2 #define CL_AVC_ME_MAJOR_8x8_INTEL 0x3 #define CL_AVC_ME_MINOR_8x8_INTEL 0x0 #define CL_AVC_ME_MINOR_8x4_INTEL 0x1 #define CL_AVC_ME_MINOR_4x8_INTEL 0x2 #define CL_AVC_ME_MINOR_4x4_INTEL 0x3 #define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0 #define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1 #define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2 #define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0 #define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E #define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D #define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B #define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77 #define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F #define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F #define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F #define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0 #define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1 #define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2 #define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3 #define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4 #define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5 #define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6 #define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7 #define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8 #define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9 #define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2 #define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa #define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0 #define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2 #define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0 #define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1 #define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3 #define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0 #define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1 #define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2 #define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3 #define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10 #define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15 #define 
CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20 #define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B #define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30 #define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0 #define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2 #define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4 #define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8 #define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0 #define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000 #define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) #define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) #define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 ) #define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 ) #define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 ) #define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 ) #define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 ) #define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 ) #define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 ) #define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 ) #define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 ) #define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 ) #define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 ) #define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 ) #define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00 #define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80 #define CL_AVC_ME_INTRA_16x16_INTEL 0x0 #define CL_AVC_ME_INTRA_8x8_INTEL 0x1 #define CL_AVC_ME_INTRA_4x4_INTEL 0x2 #define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6 #define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5 #define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3 #define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60 #define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10 #define 
CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8 #define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7 #define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8 #define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0 #define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1 #define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2 #define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3 #define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1 #define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2 #define CL_AVC_ME_FRAME_DUAL_INTEL 0x3 #define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0 #define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1 #define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2 #define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0 #define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1 /******************************************* * cl_intel_unified_shared_memory extension * ********************************************/ /* These APIs are in sync with Revision O of the cl_intel_unified_shared_memory spec! 
*/ #define cl_intel_unified_shared_memory 1 /* cl_device_info */ #define CL_DEVICE_HOST_MEM_CAPABILITIES_INTEL 0x4190 #define CL_DEVICE_DEVICE_MEM_CAPABILITIES_INTEL 0x4191 #define CL_DEVICE_SINGLE_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4192 #define CL_DEVICE_CROSS_DEVICE_SHARED_MEM_CAPABILITIES_INTEL 0x4193 #define CL_DEVICE_SHARED_SYSTEM_MEM_CAPABILITIES_INTEL 0x4194 typedef cl_bitfield cl_device_unified_shared_memory_capabilities_intel; /* cl_device_unified_shared_memory_capabilities_intel - bitfield */ #define CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL (1 << 0) #define CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL (1 << 1) #define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL (1 << 2) #define CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL (1 << 3) typedef cl_bitfield cl_mem_properties_intel; /* cl_mem_properties_intel */ #define CL_MEM_ALLOC_FLAGS_INTEL 0x4195 typedef cl_bitfield cl_mem_alloc_flags_intel; /* cl_mem_alloc_flags_intel - bitfield */ #define CL_MEM_ALLOC_WRITE_COMBINED_INTEL (1 << 0) typedef cl_uint cl_mem_info_intel; /* cl_mem_alloc_info_intel */ #define CL_MEM_ALLOC_TYPE_INTEL 0x419A #define CL_MEM_ALLOC_BASE_PTR_INTEL 0x419B #define CL_MEM_ALLOC_SIZE_INTEL 0x419C #define CL_MEM_ALLOC_DEVICE_INTEL 0x419D /* Enum values 0x419E-0x419F are reserved for future queries. */ typedef cl_uint cl_unified_shared_memory_type_intel; /* cl_unified_shared_memory_type_intel */ #define CL_MEM_TYPE_UNKNOWN_INTEL 0x4196 #define CL_MEM_TYPE_HOST_INTEL 0x4197 #define CL_MEM_TYPE_DEVICE_INTEL 0x4198 #define CL_MEM_TYPE_SHARED_INTEL 0x4199 typedef cl_uint cl_mem_advice_intel; /* cl_mem_advice_intel */ /* Enum values 0x4208-0x420F are reserved for future memory advices. 
*/ /* cl_kernel_exec_info */ #define CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL 0x4200 #define CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL 0x4201 #define CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL 0x4202 #define CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL 0x4203 /* cl_command_type */ #define CL_COMMAND_MEMFILL_INTEL 0x4204 #define CL_COMMAND_MEMCPY_INTEL 0x4205 #define CL_COMMAND_MIGRATEMEM_INTEL 0x4206 #define CL_COMMAND_MEMADVISE_INTEL 0x4207 extern CL_API_ENTRY void* CL_API_CALL clHostMemAllocINTEL( cl_context context, const cl_mem_properties_intel* properties, size_t size, cl_uint alignment, cl_int* errcode_ret); typedef CL_API_ENTRY void* (CL_API_CALL * clHostMemAllocINTEL_fn)( cl_context context, const cl_mem_properties_intel* properties, size_t size, cl_uint alignment, cl_int* errcode_ret); extern CL_API_ENTRY void* CL_API_CALL clDeviceMemAllocINTEL( cl_context context, cl_device_id device, const cl_mem_properties_intel* properties, size_t size, cl_uint alignment, cl_int* errcode_ret); typedef CL_API_ENTRY void* (CL_API_CALL * clDeviceMemAllocINTEL_fn)( cl_context context, cl_device_id device, const cl_mem_properties_intel* properties, size_t size, cl_uint alignment, cl_int* errcode_ret); extern CL_API_ENTRY void* CL_API_CALL clSharedMemAllocINTEL( cl_context context, cl_device_id device, const cl_mem_properties_intel* properties, size_t size, cl_uint alignment, cl_int* errcode_ret); typedef CL_API_ENTRY void* (CL_API_CALL * clSharedMemAllocINTEL_fn)( cl_context context, cl_device_id device, const cl_mem_properties_intel* properties, size_t size, cl_uint alignment, cl_int* errcode_ret); extern CL_API_ENTRY cl_int CL_API_CALL clMemFreeINTEL( cl_context context, void* ptr); typedef CL_API_ENTRY cl_int (CL_API_CALL * clMemFreeINTEL_fn)( cl_context context, void* ptr); extern CL_API_ENTRY cl_int CL_API_CALL clGetMemAllocInfoINTEL( cl_context context, const void* ptr, cl_mem_info_intel param_name, size_t param_value_size, void* param_value, size_t* 
param_value_size_ret); typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetMemAllocInfoINTEL_fn)( cl_context context, const void* ptr, cl_mem_info_intel param_name, size_t param_value_size, void* param_value, size_t* param_value_size_ret); extern CL_API_ENTRY cl_int CL_API_CALL clSetKernelArgMemPointerINTEL( cl_kernel kernel, cl_uint arg_index, const void* arg_value); typedef CL_API_ENTRY cl_int (CL_API_CALL * clSetKernelArgMemPointerINTEL_fn)( cl_kernel kernel, cl_uint arg_index, const void* arg_value); extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueMemsetINTEL( /* Deprecated */ cl_command_queue command_queue, void* dst_ptr, cl_int value, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event); typedef CL_API_ENTRY cl_int (CL_API_CALL * clEnqueueMemsetINTEL_fn)( /* Deprecated */ cl_command_queue command_queue, void* dst_ptr, cl_int value, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event); extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueMemFillINTEL( cl_command_queue command_queue, void* dst_ptr, const void* pattern, size_t pattern_size, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event); typedef CL_API_ENTRY cl_int (CL_API_CALL * clEnqueueMemFillINTEL_fn)( cl_command_queue command_queue, void* dst_ptr, const void* pattern, size_t pattern_size, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event); extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueMemcpyINTEL( cl_command_queue command_queue, cl_bool blocking, void* dst_ptr, const void* src_ptr, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event); typedef CL_API_ENTRY cl_int (CL_API_CALL * clEnqueueMemcpyINTEL_fn)( cl_command_queue command_queue, cl_bool blocking, void* dst_ptr, const void* src_ptr, size_t size, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event); #ifdef 
CL_VERSION_1_2 /* Because these APIs use cl_mem_migration_flags, they require OpenCL 1.2: */ extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueMigrateMemINTEL( cl_command_queue command_queue, const void* ptr, size_t size, cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event); typedef CL_API_ENTRY cl_int (CL_API_CALL * clEnqueueMigrateMemINTEL_fn)( cl_command_queue command_queue, const void* ptr, size_t size, cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event); #endif extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueMemAdviseINTEL( cl_command_queue command_queue, const void* ptr, size_t size, cl_mem_advice_intel advice, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event); typedef CL_API_ENTRY cl_int (CL_API_CALL * clEnqueueMemAdviseINTEL_fn)( cl_command_queue command_queue, const void* ptr, size_t size, cl_mem_advice_intel advice, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event); #ifdef __cplusplus } #endif #endif /* __CL_EXT_INTEL_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_gl.h ================================================ /******************************************************************************* * Copyright (c) 2008-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
******************************************************************************/ #ifndef __OPENCL_CL_GL_H #define __OPENCL_CL_GL_H #include #ifdef __cplusplus extern "C" { #endif typedef cl_uint cl_gl_object_type; typedef cl_uint cl_gl_texture_info; typedef cl_uint cl_gl_platform_info; typedef struct __GLsync *cl_GLsync; /* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ #define CL_GL_OBJECT_BUFFER 0x2000 #define CL_GL_OBJECT_TEXTURE2D 0x2001 #define CL_GL_OBJECT_TEXTURE3D 0x2002 #define CL_GL_OBJECT_RENDERBUFFER 0x2003 #ifdef CL_VERSION_1_2 #define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E #define CL_GL_OBJECT_TEXTURE1D 0x200F #define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 #define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 #endif /* cl_gl_texture_info */ #define CL_GL_TEXTURE_TARGET 0x2004 #define CL_GL_MIPMAP_LEVEL 0x2005 #ifdef CL_VERSION_1_2 #define CL_GL_NUM_SAMPLES 0x2012 #endif extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromGLBuffer(cl_context context, cl_mem_flags flags, cl_GLuint bufobj, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromGLTexture(cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, cl_GLuint texture, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; #endif extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromGLRenderbuffer(cl_context context, cl_mem_flags flags, cl_GLuint renderbuffer, cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetGLObjectInfo(cl_mem memobj, cl_gl_object_type * gl_object_type, cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetGLTextureInfo(cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireGLObjects(cl_command_queue command_queue, cl_uint num_objects, const cl_mem * 
mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseGLObjects(cl_command_queue command_queue, cl_uint num_objects, const cl_mem * mem_objects, cl_uint num_events_in_wait_list, const cl_event * event_wait_list, cl_event * event) CL_API_SUFFIX__VERSION_1_0; /* Deprecated OpenCL 1.1 APIs */ extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL clCreateFromGLTexture2D(cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, cl_GLuint texture, cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL clCreateFromGLTexture3D(cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, cl_GLuint texture, cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; /* cl_khr_gl_sharing extension */ #define cl_khr_gl_sharing 1 typedef cl_uint cl_gl_context_info; /* Additional Error Codes */ #define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 /* cl_gl_context_info */ #define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 #define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 /* Additional cl_context_properties */ #define CL_GL_CONTEXT_KHR 0x2008 #define CL_EGL_DISPLAY_KHR 0x2009 #define CL_GLX_DISPLAY_KHR 0x200A #define CL_WGL_HDC_KHR 0x200B #define CL_CGL_SHAREGROUP_KHR 0x200C extern CL_API_ENTRY cl_int CL_API_CALL clGetGLContextInfoKHR(const cl_context_properties * properties, cl_gl_context_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( const cl_context_properties * properties, cl_gl_context_info param_name, size_t param_value_size, void * param_value, size_t * param_value_size_ret); #ifdef __cplusplus } #endif #endif /* __OPENCL_CL_GL_H */ 
================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_gl_ext.h ================================================ /******************************************************************************* * Copyright (c) 2008-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ #ifndef __OPENCL_CL_GL_EXT_H #define __OPENCL_CL_GL_EXT_H #ifdef __cplusplus extern "C" { #endif #include /* * cl_khr_gl_event extension */ #define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D extern CL_API_ENTRY cl_event CL_API_CALL clCreateEventFromGLsyncKHR(cl_context context, cl_GLsync cl_GLsync, cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1; #ifdef __cplusplus } #endif #endif /* __OPENCL_CL_GL_EXT_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_half.h ================================================ /******************************************************************************* * Copyright (c) 2019-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ /** * This is a header-only utility library that provides OpenCL host code with * routines for converting to/from cl_half values. * * Example usage: * * #include * ... * cl_half h = cl_half_from_float(0.5f, CL_HALF_RTE); * cl_float f = cl_half_to_float(h); */ #ifndef OPENCL_CL_HALF_H #define OPENCL_CL_HALF_H #include #include #ifdef __cplusplus extern "C" { #endif /** * Rounding mode used when converting to cl_half. */ typedef enum { CL_HALF_RTE, // round to nearest even CL_HALF_RTZ, // round towards zero CL_HALF_RTP, // round towards positive infinity CL_HALF_RTN, // round towards negative infinity } cl_half_rounding_mode; /* Private utility macros. */ #define CL_HALF_EXP_MASK 0x7C00 #define CL_HALF_MAX_FINITE_MAG 0x7BFF /* * Utility to deal with values that overflow when converting to half precision. 
*/ static inline cl_half cl_half_handle_overflow(cl_half_rounding_mode rounding_mode, uint16_t sign) { if (rounding_mode == CL_HALF_RTZ) { // Round overflow towards zero -> largest finite number (preserving sign) return (sign << 15) | CL_HALF_MAX_FINITE_MAG; } else if (rounding_mode == CL_HALF_RTP && sign) { // Round negative overflow towards positive infinity -> most negative finite number return (1 << 15) | CL_HALF_MAX_FINITE_MAG; } else if (rounding_mode == CL_HALF_RTN && !sign) { // Round positive overflow towards negative infinity -> largest finite number return CL_HALF_MAX_FINITE_MAG; } // Overflow to infinity return (sign << 15) | CL_HALF_EXP_MASK; } /* * Utility to deal with values that underflow when converting to half precision. */ static inline cl_half cl_half_handle_underflow(cl_half_rounding_mode rounding_mode, uint16_t sign) { if (rounding_mode == CL_HALF_RTP && !sign) { // Round underflow towards positive infinity -> smallest positive value return (sign << 15) | 1; } else if (rounding_mode == CL_HALF_RTN && sign) { // Round underflow towards negative infinity -> largest negative value return (sign << 15) | 1; } // Flush to zero return (sign << 15); } /** * Convert a cl_float to a cl_half. 
*/ static inline cl_half cl_half_from_float(cl_float f, cl_half_rounding_mode rounding_mode) { // Type-punning to get direct access to underlying bits union { cl_float f; uint32_t i; } f32; f32.f = f; // Extract sign bit uint16_t sign = f32.i >> 31; // Extract FP32 exponent and mantissa uint32_t f_exp = (f32.i >> (CL_FLT_MANT_DIG - 1)) & 0xFF; uint32_t f_mant = f32.i & ((1 << (CL_FLT_MANT_DIG - 1)) - 1); // Remove FP32 exponent bias int32_t exp = f_exp - CL_FLT_MAX_EXP + 1; // Add FP16 exponent bias uint16_t h_exp = exp + CL_HALF_MAX_EXP - 1; // Position of the bit that will become the FP16 mantissa LSB uint32_t lsb_pos = CL_FLT_MANT_DIG - CL_HALF_MANT_DIG; // Check for NaN / infinity if (f_exp == 0xFF) { if (f_mant) { // NaN -> propagate mantissa and silence it uint16_t h_mant = f_mant >> lsb_pos; h_mant |= 0x200; return (sign << 15) | CL_HALF_EXP_MASK | h_mant; } else { // Infinity -> zero mantissa return (sign << 15) | CL_HALF_EXP_MASK; } } // Check for zero if (!f_exp && !f_mant) { return (sign << 15); } // Check for overflow if (exp >= CL_HALF_MAX_EXP) { return cl_half_handle_overflow(rounding_mode, sign); } // Check for underflow if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1)) { return cl_half_handle_underflow(rounding_mode, sign); } // Check for value that will become denormal if (exp < -14) { // Denormal -> include the implicit 1 from the FP32 mantissa h_exp = 0; f_mant |= 1 << (CL_FLT_MANT_DIG - 1); // Mantissa shift amount depends on exponent lsb_pos = -exp + (CL_FLT_MANT_DIG - 25); } // Generate FP16 mantissa by shifting FP32 mantissa uint16_t h_mant = f_mant >> lsb_pos; // Check whether we need to round uint32_t halfway = 1 << (lsb_pos - 1); uint32_t mask = (halfway << 1) - 1; switch (rounding_mode) { case CL_HALF_RTE: if ((f_mant & mask) > halfway) { // More than halfway -> round up h_mant += 1; } else if ((f_mant & mask) == halfway) { // Exactly halfway -> round to nearest even if (h_mant & 0x1) h_mant += 1; } break; case CL_HALF_RTZ: // Mantissa 
has already been truncated -> do nothing break; case CL_HALF_RTP: if ((f_mant & mask) && !sign) { // Round positive numbers up h_mant += 1; } break; case CL_HALF_RTN: if ((f_mant & mask) && sign) { // Round negative numbers down h_mant += 1; } break; } // Check for mantissa overflow if (h_mant & 0x400) { h_exp += 1; h_mant = 0; } return (sign << 15) | (h_exp << 10) | h_mant; } /** * Convert a cl_double to a cl_half. */ static inline cl_half cl_half_from_double(cl_double d, cl_half_rounding_mode rounding_mode) { // Type-punning to get direct access to underlying bits union { cl_double d; uint64_t i; } f64; f64.d = d; // Extract sign bit uint16_t sign = f64.i >> 63; // Extract FP64 exponent and mantissa uint64_t d_exp = (f64.i >> (CL_DBL_MANT_DIG - 1)) & 0x7FF; uint64_t d_mant = f64.i & (((uint64_t)1 << (CL_DBL_MANT_DIG - 1)) - 1); // Remove FP64 exponent bias int64_t exp = d_exp - CL_DBL_MAX_EXP + 1; // Add FP16 exponent bias uint16_t h_exp = (uint16_t)(exp + CL_HALF_MAX_EXP - 1); // Position of the bit that will become the FP16 mantissa LSB uint32_t lsb_pos = CL_DBL_MANT_DIG - CL_HALF_MANT_DIG; // Check for NaN / infinity if (d_exp == 0x7FF) { if (d_mant) { // NaN -> propagate mantissa and silence it uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos); h_mant |= 0x200; return (sign << 15) | CL_HALF_EXP_MASK | h_mant; } else { // Infinity -> zero mantissa return (sign << 15) | CL_HALF_EXP_MASK; } } // Check for zero if (!d_exp && !d_mant) { return (sign << 15); } // Check for overflow if (exp >= CL_HALF_MAX_EXP) { return cl_half_handle_overflow(rounding_mode, sign); } // Check for underflow if (exp < (CL_HALF_MIN_EXP - CL_HALF_MANT_DIG - 1)) { return cl_half_handle_underflow(rounding_mode, sign); } // Check for value that will become denormal if (exp < -14) { // Include the implicit 1 from the FP64 mantissa h_exp = 0; d_mant |= (uint64_t)1 << (CL_DBL_MANT_DIG - 1); // Mantissa shift amount depends on exponent lsb_pos = (uint32_t)(-exp + (CL_DBL_MANT_DIG - 25)); } // 
Generate FP16 mantissa by shifting FP64 mantissa uint16_t h_mant = (uint16_t)(d_mant >> lsb_pos); // Check whether we need to round uint64_t halfway = (uint64_t)1 << (lsb_pos - 1); uint64_t mask = (halfway << 1) - 1; switch (rounding_mode) { case CL_HALF_RTE: if ((d_mant & mask) > halfway) { // More than halfway -> round up h_mant += 1; } else if ((d_mant & mask) == halfway) { // Exactly halfway -> round to nearest even if (h_mant & 0x1) h_mant += 1; } break; case CL_HALF_RTZ: // Mantissa has already been truncated -> do nothing break; case CL_HALF_RTP: if ((d_mant & mask) && !sign) { // Round positive numbers up h_mant += 1; } break; case CL_HALF_RTN: if ((d_mant & mask) && sign) { // Round negative numbers down h_mant += 1; } break; } // Check for mantissa overflow if (h_mant & 0x400) { h_exp += 1; h_mant = 0; } return (sign << 15) | (h_exp << 10) | h_mant; } /** * Convert a cl_half to a cl_float. */ static inline cl_float cl_half_to_float(cl_half h) { // Type-punning to get direct access to underlying bits union { cl_float f; uint32_t i; } f32; // Extract sign bit uint16_t sign = h >> 15; // Extract FP16 exponent and mantissa uint16_t h_exp = (h >> (CL_HALF_MANT_DIG - 1)) & 0x1F; uint16_t h_mant = h & 0x3FF; // Remove FP16 exponent bias int32_t exp = h_exp - CL_HALF_MAX_EXP + 1; // Add FP32 exponent bias uint32_t f_exp = exp + CL_FLT_MAX_EXP - 1; // Check for NaN / infinity if (h_exp == 0x1F) { if (h_mant) { // NaN -> propagate mantissa and silence it uint32_t f_mant = h_mant << (CL_FLT_MANT_DIG - CL_HALF_MANT_DIG); f_mant |= 0x400000; f32.i = (sign << 31) | 0x7F800000 | f_mant; return f32.f; } else { // Infinity -> zero mantissa f32.i = (sign << 31) | 0x7F800000; return f32.f; } } // Check for zero / denormal if (h_exp == 0) { if (h_mant == 0) { // Zero -> zero exponent f_exp = 0; } else { // Denormal -> normalize it // - Shift mantissa to make most-significant 1 implicit // - Adjust exponent accordingly uint32_t shift = 0; while ((h_mant & 0x400) == 0) { 
h_mant <<= 1; shift++; } h_mant &= 0x3FF; f_exp -= shift - 1; } } f32.i = (sign << 31) | (f_exp << 23) | (h_mant << 13); return f32.f; } #undef CL_HALF_EXP_MASK #undef CL_HALF_MAX_FINITE_MAG #ifdef __cplusplus } #endif #endif /* OPENCL_CL_HALF_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_icd.h ================================================ /******************************************************************************* * Copyright (c) 2019-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ #ifndef OPENCL_CL_ICD_H #define OPENCL_CL_ICD_H #include #include #include #include #if defined(_WIN32) #include #include #include #endif #ifdef __cplusplus extern "C" { #endif /* * This file contains pointer type definitions for each of the CL API calls as * well as a type definition for the dispatch table used by the Khronos ICD * loader (see cl_khr_icd extension specification for background). 
*/ /* API function pointer definitions */ // Platform APIs typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformIDs)( cl_uint num_entries, cl_platform_id *platforms, cl_uint *num_platforms) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPlatformInfo)( cl_platform_id platform, cl_platform_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; // Device APIs typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDs)( cl_platform_id platform, cl_device_type device_type, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceInfo)( cl_device_id device, cl_device_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevices)( cl_device_id in_device, const cl_device_partition_property *partition_properties, cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices); typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDevice)( cl_device_id device) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDevice)( cl_device_id device) CL_API_SUFFIX__VERSION_1_2; #else typedef void *cl_api_clCreateSubDevices; typedef void *cl_api_clRetainDevice; typedef void *cl_api_clReleaseDevice; #endif // Context APIs typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContext)( const cl_context_properties *properties, cl_uint num_devices, const cl_device_id *devices, void(CL_CALLBACK *pfn_notify)(const char *, const void *, size_t, void *), void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_context(CL_API_CALL *cl_api_clCreateContextFromType)( const cl_context_properties *properties, cl_device_type device_type, void(CL_CALLBACK *pfn_notify)(const 
char *, const void *, size_t, void *), void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainContext)( cl_context context) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseContext)( cl_context context) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetContextInfo)( cl_context context, cl_context_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; // Command Queue APIs typedef CL_API_ENTRY cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueue)( cl_context context, cl_device_id device, cl_command_queue_properties properties, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_2_0 typedef CL_API_ENTRY cl_command_queue(CL_API_CALL *cl_api_clCreateCommandQueueWithProperties)( cl_context /* context */, cl_device_id /* device */, const cl_queue_properties * /* properties */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; #else typedef void *cl_api_clCreateCommandQueueWithProperties; #endif typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainCommandQueue)( cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseCommandQueue)( cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetCommandQueueInfo)( cl_command_queue command_queue, cl_command_queue_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; // Memory Object APIs typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBuffer)( cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage)( cl_context context, cl_mem_flags flags, const cl_image_format *image_format, 
const cl_image_desc *image_desc, void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; #else typedef void *cl_api_clCreateImage; #endif #ifdef CL_VERSION_3_0 typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateBufferWithProperties)( cl_context context, const cl_mem_properties *properties, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImageWithProperties)( cl_context context, const cl_mem_properties *properties, cl_mem_flags flags, const cl_image_format *image_format, const cl_image_desc *image_desc, void *host_ptr, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_3_0; #else typedef void *cl_api_clCreateBufferWithProperties; typedef void *cl_api_clCreateImageWithProperties; #endif typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainMemObject)( cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseMemObject)( cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSupportedImageFormats)( cl_context context, cl_mem_flags flags, cl_mem_object_type image_type, cl_uint num_entries, cl_image_format *image_formats, cl_uint *num_image_formats) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetMemObjectInfo)( cl_mem memobj, cl_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetImageInfo)( cl_mem image, cl_image_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_2_0 typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreatePipe)( cl_context /* context */, cl_mem_flags /* flags */, cl_uint /* pipe_packet_size */, cl_uint /* pipe_max_packets */, const cl_pipe_properties * /* properties */, cl_int * /* errcode_ret */) 
CL_API_SUFFIX__VERSION_2_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetPipeInfo)( cl_mem /* pipe */, cl_pipe_info /* param_name */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0; typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clSVMAlloc)( cl_context /* context */, cl_svm_mem_flags /* flags */, size_t /* size */, unsigned int /* alignment */)CL_API_SUFFIX__VERSION_2_0; typedef CL_API_ENTRY void(CL_API_CALL *cl_api_clSVMFree)( cl_context /* context */, void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0; #else typedef void *cl_api_clCreatePipe; typedef void *cl_api_clGetPipeInfo; typedef void *cl_api_clSVMAlloc; typedef void *cl_api_clSVMFree; #endif // Sampler APIs typedef CL_API_ENTRY cl_sampler(CL_API_CALL *cl_api_clCreateSampler)( cl_context context, cl_bool normalized_coords, cl_addressing_mode addressing_mode, cl_filter_mode filter_mode, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainSampler)( cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseSampler)( cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetSamplerInfo)( cl_sampler sampler, cl_sampler_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_2_0 typedef CL_API_ENTRY cl_sampler(CL_API_CALL *cl_api_clCreateSamplerWithProperties)( cl_context /* context */, const cl_sampler_properties * /* sampler_properties */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; #else typedef void *cl_api_clCreateSamplerWithProperties; #endif // Program Object APIs typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithSource)( cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY 
cl_program(CL_API_CALL *cl_api_clCreateProgramWithBinary)( cl_context context, cl_uint num_devices, const cl_device_id *device_list, const size_t *lengths, const unsigned char **binaries, cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithBuiltInKernels)( cl_context context, cl_uint num_devices, const cl_device_id *device_list, const char *kernel_names, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; #else typedef void *cl_api_clCreateProgramWithBuiltInKernels; #endif typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainProgram)( cl_program program) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseProgram)( cl_program program) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clBuildProgram)( cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), void *user_data) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCompileProgram)( cl_program program, cl_uint num_devices, const cl_device_id *device_list, const char *options, cl_uint num_input_headers, const cl_program *input_headers, const char **header_include_names, void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), void *user_data) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clLinkProgram)( cl_context context, cl_uint num_devices, const cl_device_id *device_list, const char *options, cl_uint num_input_programs, const cl_program *input_programs, void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), void *user_data, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; #else typedef void *cl_api_clCompileProgram; typedef void *cl_api_clLinkProgram; #endif #ifdef CL_VERSION_2_2 typedef CL_API_ENTRY cl_int(CL_API_CALL 
*cl_api_clSetProgramSpecializationConstant)( cl_program program, cl_uint spec_id, size_t spec_size, const void *spec_value) CL_API_SUFFIX__VERSION_2_2; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetProgramReleaseCallback)( cl_program program, void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data), void *user_data) CL_API_SUFFIX__VERSION_2_2; #else typedef void *cl_api_clSetProgramSpecializationConstant; typedef void *cl_api_clSetProgramReleaseCallback; #endif #ifdef CL_VERSION_1_2 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadPlatformCompiler)( cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2; #else typedef void *cl_api_clUnloadPlatformCompiler; #endif typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramInfo)( cl_program program, cl_program_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetProgramBuildInfo)( cl_program program, cl_device_id device, cl_program_build_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; // Kernel Object APIs typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCreateKernel)( cl_program program, const char *kernel_name, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateKernelsInProgram)( cl_program program, cl_uint num_kernels, cl_kernel *kernels, cl_uint *num_kernels_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainKernel)( cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseKernel)( cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArg)( cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelInfo)( 
cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelArgInfo)( cl_kernel kernel, cl_uint arg_indx, cl_kernel_arg_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; #else typedef void *cl_api_clGetKernelArgInfo; #endif typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelWorkGroupInfo)( cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_2_0 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelArgSVMPointer)( cl_kernel /* kernel */, cl_uint /* arg_index */, const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetKernelExecInfo)( cl_kernel /* kernel */, cl_kernel_exec_info /* param_name */, size_t /* param_value_size */, const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfoKHR)( cl_kernel /* in_kernel */, cl_device_id /*in_device*/, cl_kernel_sub_group_info /* param_name */, size_t /*input_value_size*/, const void * /*input_value*/, size_t /*param_value_size*/, void * /*param_value*/, size_t * /*param_value_size_ret*/) CL_EXT_SUFFIX__VERSION_2_0; #else typedef void *cl_api_clSetKernelArgSVMPointer; typedef void *cl_api_clSetKernelExecInfo; typedef void *cl_api_clGetKernelSubGroupInfoKHR; #endif // Event Object APIs typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clWaitForEvents)( cl_uint num_events, const cl_event *event_list) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventInfo)( cl_event event, cl_event_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) 
CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainEvent)(cl_event event) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseEvent)(cl_event event) CL_API_SUFFIX__VERSION_1_0; // Profiling APIs typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetEventProfilingInfo)( cl_event event, cl_profiling_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; // Flush and Finish APIs typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFlush)( cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clFinish)( cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; // Enqueued Commands APIs typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBuffer)( cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, size_t offset, size_t cb, void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_1 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadBufferRect)( cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_read, const size_t *buffer_origin, const size_t *host_origin, const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_1; #else typedef void *cl_api_clEnqueueReadBufferRect; #endif typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBuffer)( cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_1 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueWriteBufferRect)( cl_command_queue 
command_queue, cl_mem buffer, cl_bool blocking_read, const size_t *buffer_origin, const size_t *host_origin, const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_1; #else typedef void *cl_api_clEnqueueWriteBufferRect; #endif #ifdef CL_VERSION_1_2 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillBuffer)( cl_command_queue command_queue, cl_mem buffer, const void *pattern, size_t pattern_size, size_t offset, size_t cb, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_2; #else typedef void *cl_api_clEnqueueFillBuffer; #endif typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBuffer)( cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, size_t src_offset, size_t dst_offset, size_t cb, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_1 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferRect)( cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, const size_t *src_origin, const size_t *dst_origin, const size_t *region, size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, size_t dst_slice_pitch, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_1; #else typedef void *cl_api_clEnqueueCopyBufferRect; #endif typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReadImage)( cl_command_queue command_queue, cl_mem image, cl_bool blocking_read, const size_t *origin, const size_t *region, size_t row_pitch, size_t slice_pitch, void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL 
*cl_api_clEnqueueWriteImage)( cl_command_queue command_queue, cl_mem image, cl_bool blocking_write, const size_t *origin, const size_t *region, size_t input_row_pitch, size_t input_slice_pitch, const void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueFillImage)( cl_command_queue command_queue, cl_mem image, const void *fill_color, const size_t origin[3], const size_t region[3], cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_2; #else typedef void *cl_api_clEnqueueFillImage; #endif typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImage)( cl_command_queue command_queue, cl_mem src_image, cl_mem dst_image, const size_t *src_origin, const size_t *dst_origin, const size_t *region, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyImageToBuffer)( cl_command_queue command_queue, cl_mem src_image, cl_mem dst_buffer, const size_t *src_origin, const size_t *region, size_t dst_offset, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueCopyBufferToImage)( cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image, size_t src_offset, const size_t *dst_origin, const size_t *region, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapBuffer)( cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, cl_map_flags map_flags, size_t offset, size_t cb, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0; typedef 
CL_API_ENTRY void *(CL_API_CALL *cl_api_clEnqueueMapImage)( cl_command_queue command_queue, cl_mem image, cl_bool blocking_map, cl_map_flags map_flags, const size_t *origin, const size_t *region, size_t *image_row_pitch, size_t *image_slice_pitch, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event, cl_int *errcode_ret)CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueUnmapMemObject)( cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMigrateMemObjects)( cl_command_queue command_queue, cl_uint num_mem_objects, const cl_mem *mem_objects, cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_2; #else typedef void *cl_api_clEnqueueMigrateMemObjects; #endif typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNDRangeKernel)( cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueTask)( cl_command_queue command_queue, cl_kernel kernel, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueNativeKernel)( cl_command_queue command_queue, void(CL_CALLBACK *user_func)(void *), void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list, const void **args_mem_loc, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; #ifdef CL_VERSION_1_2 typedef CL_API_ENTRY 
cl_int(CL_API_CALL *cl_api_clEnqueueMarkerWithWaitList)( cl_command_queue command_queue, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrierWithWaitList)( cl_command_queue command_queue, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY void *( CL_API_CALL *cl_api_clGetExtensionFunctionAddressForPlatform)( cl_platform_id platform, const char *function_name)CL_API_SUFFIX__VERSION_1_2; #else typedef void *cl_api_clEnqueueMarkerWithWaitList; typedef void *cl_api_clEnqueueBarrierWithWaitList; typedef void *cl_api_clGetExtensionFunctionAddressForPlatform; #endif // Shared Virtual Memory APIs #ifdef CL_VERSION_2_0 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMFree)( cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */, void ** /* svm_pointers */, void(CL_CALLBACK *pfn_free_func)(cl_command_queue /* queue */, cl_uint /* num_svm_pointers */, void ** /* svm_pointers[] */, void * /* user_data */), void * /* user_data */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemcpy)( cl_command_queue /* command_queue */, cl_bool /* blocking_copy */, void * /* dst_ptr */, const void * /* src_ptr */, size_t /* size */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMemFill)( cl_command_queue /* command_queue */, void * /* svm_ptr */, const void * /* pattern */, size_t /* pattern_size */, size_t /* size */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; typedef CL_API_ENTRY 
cl_int(CL_API_CALL *cl_api_clEnqueueSVMMap)( cl_command_queue /* command_queue */, cl_bool /* blocking_map */, cl_map_flags /* map_flags */, void * /* svm_ptr */, size_t /* size */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMUnmap)( cl_command_queue /* command_queue */, void * /* svm_ptr */, cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; #else typedef void *cl_api_clEnqueueSVMFree; typedef void *cl_api_clEnqueueSVMMemcpy; typedef void *cl_api_clEnqueueSVMMemFill; typedef void *cl_api_clEnqueueSVMMap; typedef void *cl_api_clEnqueueSVMUnmap; #endif // Deprecated APIs typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetCommandQueueProperty)( cl_command_queue command_queue, cl_command_queue_properties properties, cl_bool enable, cl_command_queue_properties *old_properties) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage2D)( cl_context context, cl_mem_flags flags, const cl_image_format *image_format, size_t image_width, size_t image_height, size_t image_row_pitch, void *host_ptr, cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateImage3D)( cl_context context, cl_mem_flags flags, const cl_image_format *image_format, size_t image_width, size_t image_height, size_t image_depth, size_t image_row_pitch, size_t image_slice_pitch, void *host_ptr, cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clUnloadCompiler)(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueMarker)( cl_command_queue command_queue, cl_event *event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; typedef CL_API_ENTRY cl_int(CL_API_CALL 
*cl_api_clEnqueueWaitForEvents)( cl_command_queue command_queue, cl_uint num_events, const cl_event *event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueBarrier)( cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; typedef CL_API_ENTRY void *(CL_API_CALL *cl_api_clGetExtensionFunctionAddress)( const char *function_name)CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; // GL and other APIs typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLBuffer)( cl_context context, cl_mem_flags flags, cl_GLuint bufobj, int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture)( cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture2D)( cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLTexture3D)( cl_context context, cl_mem_flags flags, cl_GLenum target, cl_GLint miplevel, cl_GLuint texture, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromGLRenderbuffer)( cl_context context, cl_mem_flags flags, cl_GLuint renderbuffer, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLObjectInfo)( cl_mem memobj, cl_gl_object_type *gl_object_type, cl_GLuint *gl_object_name) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLTextureInfo)( cl_mem memobj, cl_gl_texture_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireGLObjects)( cl_command_queue command_queue, cl_uint num_objects, 
const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseGLObjects)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; /* cl_khr_gl_sharing */ typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetGLContextInfoKHR)( const cl_context_properties *properties, cl_gl_context_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret); /* cl_khr_gl_event */ typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromGLsyncKHR)( cl_context context, cl_GLsync sync, cl_int *errcode_ret); #if defined(_WIN32) /* cl_khr_d3d10_sharing */ typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D10KHR)( cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source, void *d3d_object, cl_d3d10_device_set_khr d3d_device_set, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10BufferKHR)( cl_context context, cl_mem_flags flags, ID3D10Buffer *resource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture2DKHR)( cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource, UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D10Texture3DKHR)( cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource, UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D10ObjectsKHR)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, 
cl_event *event) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D10ObjectsKHR)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_0; extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D10KHR( cl_platform_id platform, cl_d3d10_device_source_khr d3d_device_source, void *d3d_object, cl_d3d10_device_set_khr d3d_device_set, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices); extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10BufferKHR(cl_context context, cl_mem_flags flags, ID3D10Buffer *resource, cl_int *errcode_ret); extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture2DKHR( cl_context context, cl_mem_flags flags, ID3D10Texture2D *resource, UINT subresource, cl_int *errcode_ret); extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D10Texture3DKHR( cl_context context, cl_mem_flags flags, ID3D10Texture3D *resource, UINT subresource, cl_int *errcode_ret); extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D10ObjectsKHR( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D10ObjectsKHR( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); /* cl_khr_d3d11_sharing */ typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromD3D11KHR)( cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source, void *d3d_object, cl_d3d11_device_set_khr d3d_device_set, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11BufferKHR)( cl_context context, cl_mem_flags 
flags, ID3D11Buffer *resource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture2DKHR)( cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource, UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromD3D11Texture3DKHR)( cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource, UINT subresource, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireD3D11ObjectsKHR)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseD3D11ObjectsKHR)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_2; /* cl_khr_dx9_media_sharing */ typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR)( cl_platform_id platform, cl_uint num_media_adapters, cl_dx9_media_adapter_type_khr *media_adapters_type, void *media_adapters, cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromDX9MediaSurfaceKHR)( cl_context context, cl_mem_flags flags, cl_dx9_media_adapter_type_khr adapter_type, void *surface_info, cl_uint plane, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY 
cl_int(CL_API_CALL *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_1_2; /* cl_khr_d3d11_sharing */ extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromD3D11KHR( cl_platform_id platform, cl_d3d11_device_source_khr d3d_device_source, void *d3d_object, cl_d3d11_device_set_khr d3d_device_set, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices); extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11BufferKHR(cl_context context, cl_mem_flags flags, ID3D11Buffer *resource, cl_int *errcode_ret); extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture2DKHR( cl_context context, cl_mem_flags flags, ID3D11Texture2D *resource, UINT subresource, cl_int *errcode_ret); extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromD3D11Texture3DKHR( cl_context context, cl_mem_flags flags, ID3D11Texture3D *resource, UINT subresource, cl_int *errcode_ret); extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireD3D11ObjectsKHR( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseD3D11ObjectsKHR( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); /* cl_khr_dx9_media_sharing */ extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromDX9MediaAdapterKHR( cl_platform_id platform, cl_uint num_media_adapters, cl_dx9_media_adapter_type_khr *media_adapter_type, void *media_adapters, cl_dx9_media_adapter_set_khr media_adapter_set, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices); extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromDX9MediaSurfaceKHR( cl_context context, cl_mem_flags flags, 
cl_dx9_media_adapter_type_khr adapter_type, void *surface_info, cl_uint plane, cl_int *errcode_ret); extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireDX9MediaSurfacesKHR( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseDX9MediaSurfacesKHR( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); #else /* cl_khr_d3d10_sharing */ typedef void *cl_api_clGetDeviceIDsFromD3D10KHR; typedef void *cl_api_clCreateFromD3D10BufferKHR; typedef void *cl_api_clCreateFromD3D10Texture2DKHR; typedef void *cl_api_clCreateFromD3D10Texture3DKHR; typedef void *cl_api_clEnqueueAcquireD3D10ObjectsKHR; typedef void *cl_api_clEnqueueReleaseD3D10ObjectsKHR; /* cl_khr_d3d11_sharing */ typedef void *cl_api_clGetDeviceIDsFromD3D11KHR; typedef void *cl_api_clCreateFromD3D11BufferKHR; typedef void *cl_api_clCreateFromD3D11Texture2DKHR; typedef void *cl_api_clCreateFromD3D11Texture3DKHR; typedef void *cl_api_clEnqueueAcquireD3D11ObjectsKHR; typedef void *cl_api_clEnqueueReleaseD3D11ObjectsKHR; /* cl_khr_dx9_media_sharing */ typedef void *cl_api_clCreateFromDX9MediaSurfaceKHR; typedef void *cl_api_clEnqueueAcquireDX9MediaSurfacesKHR; typedef void *cl_api_clEnqueueReleaseDX9MediaSurfacesKHR; typedef void *cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR; #endif /* OpenCL 1.1 */ #ifdef CL_VERSION_1_1 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetEventCallback)( cl_event /* event */, cl_int /* command_exec_callback_type */, void(CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateSubBuffer)( cl_mem /* buffer */, cl_mem_flags /* flags */, cl_buffer_create_type /* buffer_create_type */, const void * /* 
buffer_create_info */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetMemObjectDestructorCallback)( cl_mem /* memobj */, void(CL_CALLBACK * /*pfn_notify*/)(cl_mem /* memobj */, void * /*user_data*/), void * /*user_data */) CL_API_SUFFIX__VERSION_1_1; typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateUserEvent)( cl_context /* context */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetUserEventStatus)( cl_event /* event */, cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; #else typedef void *cl_api_clSetEventCallback; typedef void *cl_api_clCreateSubBuffer; typedef void *cl_api_clSetMemObjectDestructorCallback; typedef void *cl_api_clCreateUserEvent; typedef void *cl_api_clSetUserEventStatus; #endif typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clCreateSubDevicesEXT)( cl_device_id in_device, const cl_device_partition_property_ext *partition_properties, cl_uint num_entries, cl_device_id *out_devices, cl_uint *num_devices); typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clRetainDeviceEXT)( cl_device_id device) CL_API_SUFFIX__VERSION_1_0; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clReleaseDeviceEXT)( cl_device_id device) CL_API_SUFFIX__VERSION_1_0; /* cl_khr_egl_image */ typedef CL_API_ENTRY cl_mem(CL_API_CALL *cl_api_clCreateFromEGLImageKHR)( cl_context context, CLeglDisplayKHR display, CLeglImageKHR image, cl_mem_flags flags, const cl_egl_image_properties_khr *properties, cl_int *errcode_ret); typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueAcquireEGLObjectsKHR)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueReleaseEGLObjectsKHR)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem *mem_objects, cl_uint 
num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); /* cl_khr_egl_event */ typedef CL_API_ENTRY cl_event(CL_API_CALL *cl_api_clCreateEventFromEGLSyncKHR)( cl_context context, CLeglSyncKHR sync, CLeglDisplayKHR display, cl_int *errcode_ret); #ifdef CL_VERSION_2_1 typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clSetDefaultDeviceCommandQueue)( cl_context context, cl_device_id device, cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1; typedef CL_API_ENTRY cl_program(CL_API_CALL *cl_api_clCreateProgramWithIL)( cl_context context, const void *il, size_t length, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetKernelSubGroupInfo)( cl_kernel kernel, cl_device_id device, cl_kernel_sub_group_info param_name, size_t input_value_size, const void *input_value, size_t param_value_size, void *param_value, size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_2_1; typedef CL_API_ENTRY cl_kernel(CL_API_CALL *cl_api_clCloneKernel)( cl_kernel source_kernel, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_2_1; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clEnqueueSVMMigrateMem)( cl_command_queue command_queue, cl_uint num_svm_pointers, const void **svm_pointers, const size_t *sizes, cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) CL_API_SUFFIX__VERSION_2_1; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetDeviceAndHostTimer)( cl_device_id device, cl_ulong *device_timestamp, cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1; typedef CL_API_ENTRY cl_int(CL_API_CALL *cl_api_clGetHostTimer)( cl_device_id device, cl_ulong *host_timestamp) CL_API_SUFFIX__VERSION_2_1; #else typedef void *cl_api_clSetDefaultDeviceCommandQueue; typedef void *cl_api_clCreateProgramWithIL; typedef void *cl_api_clGetKernelSubGroupInfo; typedef void *cl_api_clCloneKernel; typedef void *cl_api_clEnqueueSVMMigrateMem; typedef void 
*cl_api_clGetDeviceAndHostTimer; typedef void *cl_api_clGetHostTimer; #endif /* Vendor dispatch table structure: one slot per API entry point, typed by the cl_api_* typedefs above and grouped under per-version / per-extension section comments. */ typedef struct _cl_icd_dispatch { /* OpenCL 1.0 */ cl_api_clGetPlatformIDs clGetPlatformIDs; cl_api_clGetPlatformInfo clGetPlatformInfo; cl_api_clGetDeviceIDs clGetDeviceIDs; cl_api_clGetDeviceInfo clGetDeviceInfo; cl_api_clCreateContext clCreateContext; cl_api_clCreateContextFromType clCreateContextFromType; cl_api_clRetainContext clRetainContext; cl_api_clReleaseContext clReleaseContext; cl_api_clGetContextInfo clGetContextInfo; cl_api_clCreateCommandQueue clCreateCommandQueue; cl_api_clRetainCommandQueue clRetainCommandQueue; cl_api_clReleaseCommandQueue clReleaseCommandQueue; cl_api_clGetCommandQueueInfo clGetCommandQueueInfo; cl_api_clSetCommandQueueProperty clSetCommandQueueProperty; cl_api_clCreateBuffer clCreateBuffer; cl_api_clCreateImage2D clCreateImage2D; cl_api_clCreateImage3D clCreateImage3D; cl_api_clRetainMemObject clRetainMemObject; cl_api_clReleaseMemObject clReleaseMemObject; cl_api_clGetSupportedImageFormats clGetSupportedImageFormats; cl_api_clGetMemObjectInfo clGetMemObjectInfo; cl_api_clGetImageInfo clGetImageInfo; cl_api_clCreateSampler clCreateSampler; cl_api_clRetainSampler clRetainSampler; cl_api_clReleaseSampler clReleaseSampler; cl_api_clGetSamplerInfo clGetSamplerInfo; cl_api_clCreateProgramWithSource clCreateProgramWithSource; cl_api_clCreateProgramWithBinary clCreateProgramWithBinary; cl_api_clRetainProgram clRetainProgram; cl_api_clReleaseProgram clReleaseProgram; cl_api_clBuildProgram clBuildProgram; cl_api_clUnloadCompiler clUnloadCompiler; cl_api_clGetProgramInfo clGetProgramInfo; cl_api_clGetProgramBuildInfo clGetProgramBuildInfo; cl_api_clCreateKernel clCreateKernel; cl_api_clCreateKernelsInProgram clCreateKernelsInProgram; cl_api_clRetainKernel clRetainKernel; cl_api_clReleaseKernel clReleaseKernel; cl_api_clSetKernelArg clSetKernelArg; cl_api_clGetKernelInfo clGetKernelInfo; cl_api_clGetKernelWorkGroupInfo
clGetKernelWorkGroupInfo; cl_api_clWaitForEvents clWaitForEvents; cl_api_clGetEventInfo clGetEventInfo; cl_api_clRetainEvent clRetainEvent; cl_api_clReleaseEvent clReleaseEvent; cl_api_clGetEventProfilingInfo clGetEventProfilingInfo; cl_api_clFlush clFlush; cl_api_clFinish clFinish; cl_api_clEnqueueReadBuffer clEnqueueReadBuffer; cl_api_clEnqueueWriteBuffer clEnqueueWriteBuffer; cl_api_clEnqueueCopyBuffer clEnqueueCopyBuffer; cl_api_clEnqueueReadImage clEnqueueReadImage; cl_api_clEnqueueWriteImage clEnqueueWriteImage; cl_api_clEnqueueCopyImage clEnqueueCopyImage; cl_api_clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer; cl_api_clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage; cl_api_clEnqueueMapBuffer clEnqueueMapBuffer; cl_api_clEnqueueMapImage clEnqueueMapImage; cl_api_clEnqueueUnmapMemObject clEnqueueUnmapMemObject; cl_api_clEnqueueNDRangeKernel clEnqueueNDRangeKernel; cl_api_clEnqueueTask clEnqueueTask; cl_api_clEnqueueNativeKernel clEnqueueNativeKernel; cl_api_clEnqueueMarker clEnqueueMarker; cl_api_clEnqueueWaitForEvents clEnqueueWaitForEvents; cl_api_clEnqueueBarrier clEnqueueBarrier; cl_api_clGetExtensionFunctionAddress clGetExtensionFunctionAddress; cl_api_clCreateFromGLBuffer clCreateFromGLBuffer; cl_api_clCreateFromGLTexture2D clCreateFromGLTexture2D; cl_api_clCreateFromGLTexture3D clCreateFromGLTexture3D; cl_api_clCreateFromGLRenderbuffer clCreateFromGLRenderbuffer; cl_api_clGetGLObjectInfo clGetGLObjectInfo; cl_api_clGetGLTextureInfo clGetGLTextureInfo; cl_api_clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects; cl_api_clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects; cl_api_clGetGLContextInfoKHR clGetGLContextInfoKHR; /* cl_khr_d3d10_sharing */ cl_api_clGetDeviceIDsFromD3D10KHR clGetDeviceIDsFromD3D10KHR; cl_api_clCreateFromD3D10BufferKHR clCreateFromD3D10BufferKHR; cl_api_clCreateFromD3D10Texture2DKHR clCreateFromD3D10Texture2DKHR; cl_api_clCreateFromD3D10Texture3DKHR clCreateFromD3D10Texture3DKHR; cl_api_clEnqueueAcquireD3D10ObjectsKHR 
clEnqueueAcquireD3D10ObjectsKHR; cl_api_clEnqueueReleaseD3D10ObjectsKHR clEnqueueReleaseD3D10ObjectsKHR; /* OpenCL 1.1 */ cl_api_clSetEventCallback clSetEventCallback; cl_api_clCreateSubBuffer clCreateSubBuffer; cl_api_clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback; cl_api_clCreateUserEvent clCreateUserEvent; cl_api_clSetUserEventStatus clSetUserEventStatus; cl_api_clEnqueueReadBufferRect clEnqueueReadBufferRect; cl_api_clEnqueueWriteBufferRect clEnqueueWriteBufferRect; cl_api_clEnqueueCopyBufferRect clEnqueueCopyBufferRect; /* cl_ext_device_fission */ cl_api_clCreateSubDevicesEXT clCreateSubDevicesEXT; cl_api_clRetainDeviceEXT clRetainDeviceEXT; cl_api_clReleaseDeviceEXT clReleaseDeviceEXT; /* cl_khr_gl_event */ cl_api_clCreateEventFromGLsyncKHR clCreateEventFromGLsyncKHR; /* OpenCL 1.2 */ cl_api_clCreateSubDevices clCreateSubDevices; cl_api_clRetainDevice clRetainDevice; cl_api_clReleaseDevice clReleaseDevice; cl_api_clCreateImage clCreateImage; cl_api_clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels; cl_api_clCompileProgram clCompileProgram; cl_api_clLinkProgram clLinkProgram; cl_api_clUnloadPlatformCompiler clUnloadPlatformCompiler; cl_api_clGetKernelArgInfo clGetKernelArgInfo; cl_api_clEnqueueFillBuffer clEnqueueFillBuffer; cl_api_clEnqueueFillImage clEnqueueFillImage; cl_api_clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects; cl_api_clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList; cl_api_clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList; cl_api_clGetExtensionFunctionAddressForPlatform clGetExtensionFunctionAddressForPlatform; cl_api_clCreateFromGLTexture clCreateFromGLTexture; /* cl_khr_d3d11_sharing */ cl_api_clGetDeviceIDsFromD3D11KHR clGetDeviceIDsFromD3D11KHR; cl_api_clCreateFromD3D11BufferKHR clCreateFromD3D11BufferKHR; cl_api_clCreateFromD3D11Texture2DKHR clCreateFromD3D11Texture2DKHR; cl_api_clCreateFromD3D11Texture3DKHR clCreateFromD3D11Texture3DKHR; cl_api_clCreateFromDX9MediaSurfaceKHR 
clCreateFromDX9MediaSurfaceKHR; cl_api_clEnqueueAcquireD3D11ObjectsKHR clEnqueueAcquireD3D11ObjectsKHR; cl_api_clEnqueueReleaseD3D11ObjectsKHR clEnqueueReleaseD3D11ObjectsKHR; /* cl_khr_dx9_media_sharing */ cl_api_clGetDeviceIDsFromDX9MediaAdapterKHR clGetDeviceIDsFromDX9MediaAdapterKHR; cl_api_clEnqueueAcquireDX9MediaSurfacesKHR clEnqueueAcquireDX9MediaSurfacesKHR; cl_api_clEnqueueReleaseDX9MediaSurfacesKHR clEnqueueReleaseDX9MediaSurfacesKHR; /* cl_khr_egl_image */ cl_api_clCreateFromEGLImageKHR clCreateFromEGLImageKHR; cl_api_clEnqueueAcquireEGLObjectsKHR clEnqueueAcquireEGLObjectsKHR; cl_api_clEnqueueReleaseEGLObjectsKHR clEnqueueReleaseEGLObjectsKHR; /* cl_khr_egl_event */ cl_api_clCreateEventFromEGLSyncKHR clCreateEventFromEGLSyncKHR; /* OpenCL 2.0 */ cl_api_clCreateCommandQueueWithProperties clCreateCommandQueueWithProperties; cl_api_clCreatePipe clCreatePipe; cl_api_clGetPipeInfo clGetPipeInfo; cl_api_clSVMAlloc clSVMAlloc; cl_api_clSVMFree clSVMFree; cl_api_clEnqueueSVMFree clEnqueueSVMFree; cl_api_clEnqueueSVMMemcpy clEnqueueSVMMemcpy; cl_api_clEnqueueSVMMemFill clEnqueueSVMMemFill; cl_api_clEnqueueSVMMap clEnqueueSVMMap; cl_api_clEnqueueSVMUnmap clEnqueueSVMUnmap; cl_api_clCreateSamplerWithProperties clCreateSamplerWithProperties; cl_api_clSetKernelArgSVMPointer clSetKernelArgSVMPointer; cl_api_clSetKernelExecInfo clSetKernelExecInfo; /* cl_khr_sub_groups */ cl_api_clGetKernelSubGroupInfoKHR clGetKernelSubGroupInfoKHR; /* OpenCL 2.1 */ cl_api_clCloneKernel clCloneKernel; cl_api_clCreateProgramWithIL clCreateProgramWithIL; cl_api_clEnqueueSVMMigrateMem clEnqueueSVMMigrateMem; cl_api_clGetDeviceAndHostTimer clGetDeviceAndHostTimer; cl_api_clGetHostTimer clGetHostTimer; cl_api_clGetKernelSubGroupInfo clGetKernelSubGroupInfo; cl_api_clSetDefaultDeviceCommandQueue clSetDefaultDeviceCommandQueue; /* OpenCL 2.2 */ cl_api_clSetProgramReleaseCallback clSetProgramReleaseCallback; cl_api_clSetProgramSpecializationConstant clSetProgramSpecializationConstant; /* 
OpenCL 3.0 */ cl_api_clCreateBufferWithProperties clCreateBufferWithProperties; cl_api_clCreateImageWithProperties clCreateImageWithProperties; } cl_icd_dispatch; #ifdef __cplusplus } #endif #endif /* #ifndef OPENCL_CL_ICD_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_platform.h ================================================ /******************************************************************************* * Copyright (c) 2008-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ #ifndef __CL_PLATFORM_H #define __CL_PLATFORM_H #include #ifdef __cplusplus extern "C" { #endif #if defined(_WIN32) #define CL_API_ENTRY #define CL_API_CALL __stdcall #define CL_CALLBACK __stdcall #else #define CL_API_ENTRY #define CL_API_CALL #define CL_CALLBACK #endif /* * Deprecation flags refer to the last version of the header in which the * feature was not deprecated. * * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without * deprecation but is deprecated in versions later than 1.1. 
*/ #define CL_EXTENSION_WEAK_LINK #define CL_API_SUFFIX__VERSION_1_0 #define CL_EXT_SUFFIX__VERSION_1_0 #define CL_API_SUFFIX__VERSION_1_1 #define CL_EXT_SUFFIX__VERSION_1_1 #define CL_API_SUFFIX__VERSION_1_2 #define CL_EXT_SUFFIX__VERSION_1_2 #define CL_API_SUFFIX__VERSION_2_0 #define CL_EXT_SUFFIX__VERSION_2_0 #define CL_API_SUFFIX__VERSION_2_1 #define CL_EXT_SUFFIX__VERSION_2_1 #define CL_API_SUFFIX__VERSION_2_2 #define CL_EXT_SUFFIX__VERSION_2_2 #define CL_API_SUFFIX__VERSION_3_0 #define CL_EXT_SUFFIX__VERSION_3_0 #define CL_API_SUFFIX__EXPERIMENTAL #define CL_EXT_SUFFIX__EXPERIMENTAL #ifdef __GNUC__ #define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated)) #define CL_EXT_PREFIX_DEPRECATED #elif defined(_WIN32) #define CL_EXT_SUFFIX_DEPRECATED #define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated) #else #define CL_EXT_SUFFIX_DEPRECATED #define CL_EXT_PREFIX_DEPRECATED #endif #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED #else #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED #endif #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED #else #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED #endif #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED #else #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED #endif #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED #else #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED #define 
CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED #endif #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED #else #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED #endif #ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED #else #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED #endif #if (defined (_WIN32) && defined(_MSC_VER)) /* scalar types */ typedef signed __int8 cl_char; typedef unsigned __int8 cl_uchar; typedef signed __int16 cl_short; typedef unsigned __int16 cl_ushort; typedef signed __int32 cl_int; typedef unsigned __int32 cl_uint; typedef signed __int64 cl_long; typedef unsigned __int64 cl_ulong; typedef unsigned __int16 cl_half; typedef float cl_float; typedef double cl_double; /* Macro names and corresponding values defined by OpenCL */ #define CL_CHAR_BIT 8 #define CL_SCHAR_MAX 127 #define CL_SCHAR_MIN (-127-1) #define CL_CHAR_MAX CL_SCHAR_MAX #define CL_CHAR_MIN CL_SCHAR_MIN #define CL_UCHAR_MAX 255 #define CL_SHRT_MAX 32767 #define CL_SHRT_MIN (-32767-1) #define CL_USHRT_MAX 65535 #define CL_INT_MAX 2147483647 #define CL_INT_MIN (-2147483647-1) #define CL_UINT_MAX 0xffffffffU #define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) #define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) #define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) #define CL_FLT_DIG 6 #define CL_FLT_MANT_DIG 24 #define CL_FLT_MAX_10_EXP +38 #define CL_FLT_MAX_EXP +128 #define CL_FLT_MIN_10_EXP -37 #define CL_FLT_MIN_EXP -125 #define CL_FLT_RADIX 2 #define CL_FLT_MAX 340282346638528859811704183484516925440.0f #define CL_FLT_MIN 1.175494350822287507969e-38f #define CL_FLT_EPSILON 1.1920928955078125e-7f #define 
CL_HALF_DIG 3 #define CL_HALF_MANT_DIG 11 #define CL_HALF_MAX_10_EXP +4 #define CL_HALF_MAX_EXP +16 #define CL_HALF_MIN_10_EXP -4 #define CL_HALF_MIN_EXP -13 #define CL_HALF_RADIX 2 #define CL_HALF_MAX 65504.0f #define CL_HALF_MIN 6.103515625e-05f #define CL_HALF_EPSILON 9.765625e-04f #define CL_DBL_DIG 15 #define CL_DBL_MANT_DIG 53 #define CL_DBL_MAX_10_EXP +308 #define CL_DBL_MAX_EXP +1024 #define CL_DBL_MIN_10_EXP -307 #define CL_DBL_MIN_EXP -1021 #define CL_DBL_RADIX 2 #define CL_DBL_MAX 1.7976931348623158e+308 #define CL_DBL_MIN 2.225073858507201383090e-308 #define CL_DBL_EPSILON 2.220446049250313080847e-16 #define CL_M_E 2.7182818284590452354 #define CL_M_LOG2E 1.4426950408889634074 #define CL_M_LOG10E 0.43429448190325182765 #define CL_M_LN2 0.69314718055994530942 #define CL_M_LN10 2.30258509299404568402 #define CL_M_PI 3.14159265358979323846 #define CL_M_PI_2 1.57079632679489661923 #define CL_M_PI_4 0.78539816339744830962 #define CL_M_1_PI 0.31830988618379067154 #define CL_M_2_PI 0.63661977236758134308 #define CL_M_2_SQRTPI 1.12837916709551257390 #define CL_M_SQRT2 1.41421356237309504880 #define CL_M_SQRT1_2 0.70710678118654752440 #define CL_M_E_F 2.718281828f #define CL_M_LOG2E_F 1.442695041f #define CL_M_LOG10E_F 0.434294482f #define CL_M_LN2_F 0.693147181f #define CL_M_LN10_F 2.302585093f #define CL_M_PI_F 3.141592654f #define CL_M_PI_2_F 1.570796327f #define CL_M_PI_4_F 0.785398163f #define CL_M_1_PI_F 0.318309886f #define CL_M_2_PI_F 0.636619772f #define CL_M_2_SQRTPI_F 1.128379167f #define CL_M_SQRT2_F 1.414213562f #define CL_M_SQRT1_2_F 0.707106781f #define CL_NAN (CL_INFINITY - CL_INFINITY) #define CL_HUGE_VALF ((cl_float) 1e50) #define CL_HUGE_VAL ((cl_double) 1e500) #define CL_MAXFLOAT CL_FLT_MAX #define CL_INFINITY CL_HUGE_VALF #else #include <stdint.h> /* fixed-width integer types used by the scalar typedefs below */ /* scalar types */ typedef int8_t cl_char; typedef uint8_t cl_uchar; typedef int16_t cl_short; typedef uint16_t cl_ushort; typedef int32_t cl_int; typedef uint32_t cl_uint; typedef int64_t cl_long; typedef
uint64_t cl_ulong; typedef uint16_t cl_half; typedef float cl_float; typedef double cl_double; /* Macro names and corresponding values defined by OpenCL */ #define CL_CHAR_BIT 8 #define CL_SCHAR_MAX 127 #define CL_SCHAR_MIN (-127-1) #define CL_CHAR_MAX CL_SCHAR_MAX #define CL_CHAR_MIN CL_SCHAR_MIN #define CL_UCHAR_MAX 255 #define CL_SHRT_MAX 32767 #define CL_SHRT_MIN (-32767-1) #define CL_USHRT_MAX 65535 #define CL_INT_MAX 2147483647 #define CL_INT_MIN (-2147483647-1) #define CL_UINT_MAX 0xffffffffU #define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) #define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) #define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) #define CL_FLT_DIG 6 #define CL_FLT_MANT_DIG 24 #define CL_FLT_MAX_10_EXP +38 #define CL_FLT_MAX_EXP +128 #define CL_FLT_MIN_10_EXP -37 #define CL_FLT_MIN_EXP -125 #define CL_FLT_RADIX 2 #define CL_FLT_MAX 340282346638528859811704183484516925440.0f #define CL_FLT_MIN 1.175494350822287507969e-38f #define CL_FLT_EPSILON 1.1920928955078125e-7f #define CL_HALF_DIG 3 #define CL_HALF_MANT_DIG 11 #define CL_HALF_MAX_10_EXP +4 #define CL_HALF_MAX_EXP +16 #define CL_HALF_MIN_10_EXP -4 #define CL_HALF_MIN_EXP -13 #define CL_HALF_RADIX 2 #define CL_HALF_MAX 65504.0f #define CL_HALF_MIN 6.103515625e-05f #define CL_HALF_EPSILON 9.765625e-04f #define CL_DBL_DIG 15 #define CL_DBL_MANT_DIG 53 #define CL_DBL_MAX_10_EXP +308 #define CL_DBL_MAX_EXP +1024 #define CL_DBL_MIN_10_EXP -307 #define CL_DBL_MIN_EXP -1021 #define CL_DBL_RADIX 2 #define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 #define CL_DBL_MIN 2.225073858507201383090e-308 #define CL_DBL_EPSILON 2.220446049250313080847e-16 #define CL_M_E 2.7182818284590452354 #define CL_M_LOG2E 
1.4426950408889634074 #define CL_M_LOG10E 0.43429448190325182765 #define CL_M_LN2 0.69314718055994530942 #define CL_M_LN10 2.30258509299404568402 #define CL_M_PI 3.14159265358979323846 #define CL_M_PI_2 1.57079632679489661923 #define CL_M_PI_4 0.78539816339744830962 #define CL_M_1_PI 0.31830988618379067154 #define CL_M_2_PI 0.63661977236758134308 #define CL_M_2_SQRTPI 1.12837916709551257390 #define CL_M_SQRT2 1.41421356237309504880 #define CL_M_SQRT1_2 0.70710678118654752440 #define CL_M_E_F 2.718281828f #define CL_M_LOG2E_F 1.442695041f #define CL_M_LOG10E_F 0.434294482f #define CL_M_LN2_F 0.693147181f #define CL_M_LN10_F 2.302585093f #define CL_M_PI_F 3.141592654f #define CL_M_PI_2_F 1.570796327f #define CL_M_PI_4_F 0.785398163f #define CL_M_1_PI_F 0.318309886f #define CL_M_2_PI_F 0.636619772f #define CL_M_2_SQRTPI_F 1.128379167f #define CL_M_SQRT2_F 1.414213562f #define CL_M_SQRT1_2_F 0.707106781f #if defined( __GNUC__ ) #define CL_HUGE_VALF __builtin_huge_valf() #define CL_HUGE_VAL __builtin_huge_val() #define CL_NAN __builtin_nanf( "" ) #else #define CL_HUGE_VALF ((cl_float) 1e50) #define CL_HUGE_VAL ((cl_double) 1e500) float nanf( const char * ); #define CL_NAN nanf( "" ) #endif #define CL_MAXFLOAT CL_FLT_MAX #define CL_INFINITY CL_HUGE_VALF #endif #include <stddef.h> /* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */ typedef unsigned int cl_GLuint; typedef int cl_GLint; typedef unsigned int cl_GLenum; /* * Vector types * * Note: OpenCL requires that all types be naturally aligned. * This means that vector types must be naturally aligned. * For example, a vector of four floats must be aligned to * a 16 byte boundary (calculated as 4 * the natural 4-byte * alignment of the float). The alignment qualifiers here * will only function properly if your compiler supports them * and if you don't actively work to defeat them.
For example, * in order for a cl_float4 to be 16 byte aligned in a struct, * the start of the struct must itself be 16-byte aligned. * * Maintaining proper alignment is the user's responsibility. */ /* Define basic vector types */ #if defined( __VEC__ ) #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */ typedef __vector unsigned char __cl_uchar16; typedef __vector signed char __cl_char16; typedef __vector unsigned short __cl_ushort8; typedef __vector signed short __cl_short8; typedef __vector unsigned int __cl_uint4; typedef __vector signed int __cl_int4; typedef __vector float __cl_float4; #define __CL_UCHAR16__ 1 #define __CL_CHAR16__ 1 #define __CL_USHORT8__ 1 #define __CL_SHORT8__ 1 #define __CL_UINT4__ 1 #define __CL_INT4__ 1 #define __CL_FLOAT4__ 1 #endif #if defined( __SSE__ ) #if defined( __MINGW64__ ) #include <intrin.h> #else #include <xmmintrin.h> #endif #if defined( __GNUC__ ) typedef float __cl_float4 __attribute__((vector_size(16))); #else typedef __m128 __cl_float4; #endif #define __CL_FLOAT4__ 1 #endif #if defined( __SSE2__ ) #if defined( __MINGW64__ ) #include <intrin.h> #else #include <emmintrin.h> #endif #if defined( __GNUC__ ) typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); typedef cl_char __cl_char16 __attribute__((vector_size(16))); typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); typedef cl_short __cl_short8 __attribute__((vector_size(16))); typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); typedef cl_int __cl_int4 __attribute__((vector_size(16))); typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); typedef cl_long __cl_long2 __attribute__((vector_size(16))); typedef cl_double __cl_double2 __attribute__((vector_size(16))); #else typedef __m128i __cl_uchar16; typedef __m128i __cl_char16; typedef __m128i __cl_ushort8; typedef __m128i __cl_short8; typedef __m128i __cl_uint4; typedef __m128i __cl_int4; typedef __m128i __cl_ulong2; typedef __m128i __cl_long2; typedef
__m128d __cl_double2; #endif #define __CL_UCHAR16__ 1 #define __CL_CHAR16__ 1 #define __CL_USHORT8__ 1 #define __CL_SHORT8__ 1 #define __CL_INT4__ 1 #define __CL_UINT4__ 1 #define __CL_ULONG2__ 1 #define __CL_LONG2__ 1 #define __CL_DOUBLE2__ 1 #endif #if defined( __MMX__ ) #include <mmintrin.h> #if defined( __GNUC__ ) typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); typedef cl_char __cl_char8 __attribute__((vector_size(8))); typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); typedef cl_short __cl_short4 __attribute__((vector_size(8))); typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); typedef cl_int __cl_int2 __attribute__((vector_size(8))); typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); typedef cl_long __cl_long1 __attribute__((vector_size(8))); typedef cl_float __cl_float2 __attribute__((vector_size(8))); #else typedef __m64 __cl_uchar8; typedef __m64 __cl_char8; typedef __m64 __cl_ushort4; typedef __m64 __cl_short4; typedef __m64 __cl_uint2; typedef __m64 __cl_int2; typedef __m64 __cl_ulong1; typedef __m64 __cl_long1; typedef __m64 __cl_float2; #endif #define __CL_UCHAR8__ 1 #define __CL_CHAR8__ 1 #define __CL_USHORT4__ 1 #define __CL_SHORT4__ 1 #define __CL_INT2__ 1 #define __CL_UINT2__ 1 #define __CL_ULONG1__ 1 #define __CL_LONG1__ 1 #define __CL_FLOAT2__ 1 #endif #if defined( __AVX__ ) #if defined( __MINGW64__ ) #include <intrin.h> #else #include <immintrin.h> #endif #if defined( __GNUC__ ) typedef cl_float __cl_float8 __attribute__((vector_size(32))); typedef cl_double __cl_double4 __attribute__((vector_size(32))); #else typedef __m256 __cl_float8; typedef __m256d __cl_double4; #endif #define __CL_FLOAT8__ 1 #define __CL_DOUBLE4__ 1 #endif /* Define capabilities for anonymous struct members. */ #if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L #define __CL_HAS_ANON_STRUCT__ 1 #define __CL_ANON_STRUCT__ #elif defined( __GNUC__) && !
defined( __STRICT_ANSI__ ) #define __CL_HAS_ANON_STRUCT__ 1 #define __CL_ANON_STRUCT__ __extension__ #elif defined( _WIN32) && defined(_MSC_VER) #if _MSC_VER >= 1500 /* Microsoft Developer Studio 2008 supports anonymous structs, but * complains by default. */ #define __CL_HAS_ANON_STRUCT__ 1 #define __CL_ANON_STRUCT__ /* Disable warning C4201: nonstandard extension used : nameless * struct/union */ #pragma warning( push ) #pragma warning( disable : 4201 ) #endif #else #define __CL_HAS_ANON_STRUCT__ 0 #define __CL_ANON_STRUCT__ #endif /* Define alignment keys */ #if defined( __GNUC__ ) #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) #elif defined( _WIN32) && (_MSC_VER) /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ /* #include */ /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ #define CL_ALIGNED(_x) #else #warning Need to implement some method to align data here #define CL_ALIGNED(_x) #endif /* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ #if __CL_HAS_ANON_STRUCT__ /* .xyzw and .s0123...{f|F} are supported */ #define CL_HAS_NAMED_VECTOR_FIELDS 1 /* .hi and .lo are supported */ #define CL_HAS_HI_LO_VECTOR_FIELDS 1 #endif /* Define cl_vector types */ /* ---- cl_charn ---- */ typedef union { cl_char CL_ALIGNED(2) s[2]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_char x, y; }; __CL_ANON_STRUCT__ struct{ cl_char s0, s1; }; __CL_ANON_STRUCT__ struct{ cl_char lo, hi; }; #endif #if defined( __CL_CHAR2__) __cl_char2 v2; #endif }cl_char2; typedef union { cl_char CL_ALIGNED(4) s[4]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; }; __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; }; #endif #if defined( __CL_CHAR2__) __cl_char2 v2[2]; #endif #if defined( __CL_CHAR4__) __cl_char4 v4; #endif }cl_char4; /* cl_char3 is identical in size, 
alignment and behavior to cl_char4. See section 6.1.5. */ typedef cl_char4 cl_char3; typedef union { cl_char CL_ALIGNED(8) s[8]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; #endif #if defined( __CL_CHAR2__) __cl_char2 v2[4]; #endif #if defined( __CL_CHAR4__) __cl_char4 v4[2]; #endif #if defined( __CL_CHAR8__ ) __cl_char8 v8; #endif }cl_char8; typedef union { cl_char CL_ALIGNED(16) s[16]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; #endif #if defined( __CL_CHAR2__) __cl_char2 v2[8]; #endif #if defined( __CL_CHAR4__) __cl_char4 v4[4]; #endif #if defined( __CL_CHAR8__ ) __cl_char8 v8[2]; #endif #if defined( __CL_CHAR16__ ) __cl_char16 v16; #endif }cl_char16; /* ---- cl_ucharn ---- */ typedef union { cl_uchar CL_ALIGNED(2) s[2]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_uchar x, y; }; __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; }; __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; }; #endif /* Guard uses the uppercase capability macro like every sibling union; the legacy lowercase spelling is still accepted for compatibility. */ #if defined( __CL_UCHAR2__) || defined( __cl_uchar2__) __cl_uchar2 v2; #endif }cl_uchar2; typedef union { cl_uchar CL_ALIGNED(4) s[4]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; }; __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; #endif #if defined( __CL_UCHAR2__) __cl_uchar2 v2[2]; #endif #if defined( __CL_UCHAR4__) __cl_uchar4 v4; #endif }cl_uchar4; /* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5.
*/ typedef cl_uchar4 cl_uchar3; typedef union { cl_uchar CL_ALIGNED(8) s[8]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; #endif #if defined( __CL_UCHAR2__) __cl_uchar2 v2[4]; #endif #if defined( __CL_UCHAR4__) __cl_uchar4 v4[2]; #endif #if defined( __CL_UCHAR8__ ) __cl_uchar8 v8; #endif }cl_uchar8; typedef union { cl_uchar CL_ALIGNED(16) s[16]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; #endif #if defined( __CL_UCHAR2__) __cl_uchar2 v2[8]; #endif #if defined( __CL_UCHAR4__) __cl_uchar4 v4[4]; #endif #if defined( __CL_UCHAR8__ ) __cl_uchar8 v8[2]; #endif #if defined( __CL_UCHAR16__ ) __cl_uchar16 v16; #endif }cl_uchar16; /* ---- cl_shortn ---- */ typedef union { cl_short CL_ALIGNED(4) s[2]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_short x, y; }; __CL_ANON_STRUCT__ struct{ cl_short s0, s1; }; __CL_ANON_STRUCT__ struct{ cl_short lo, hi; }; #endif #if defined( __CL_SHORT2__) __cl_short2 v2; #endif }cl_short2; typedef union { cl_short CL_ALIGNED(8) s[4]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; }; __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; #endif #if defined( __CL_SHORT2__) __cl_short2 v2[2]; #endif #if defined( __CL_SHORT4__) __cl_short4 v4; #endif }cl_short4; /* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. 
*/ typedef cl_short4 cl_short3; typedef union { cl_short CL_ALIGNED(16) s[8]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; #endif #if defined( __CL_SHORT2__) __cl_short2 v2[4]; #endif #if defined( __CL_SHORT4__) __cl_short4 v4[2]; #endif #if defined( __CL_SHORT8__ ) __cl_short8 v8; #endif }cl_short8; typedef union { cl_short CL_ALIGNED(32) s[16]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; #endif #if defined( __CL_SHORT2__) __cl_short2 v2[8]; #endif #if defined( __CL_SHORT4__) __cl_short4 v4[4]; #endif #if defined( __CL_SHORT8__ ) __cl_short8 v8[2]; #endif #if defined( __CL_SHORT16__ ) __cl_short16 v16; #endif }cl_short16; /* ---- cl_ushortn ---- */ typedef union { cl_ushort CL_ALIGNED(4) s[2]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_ushort x, y; }; __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; }; __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; }; #endif #if defined( __CL_USHORT2__) __cl_ushort2 v2; #endif }cl_ushort2; typedef union { cl_ushort CL_ALIGNED(8) s[4]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; }; __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; #endif #if defined( __CL_USHORT2__) __cl_ushort2 v2[2]; #endif #if defined( __CL_USHORT4__) __cl_ushort4 v4; #endif }cl_ushort4; /* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. 
*/ typedef cl_ushort4 cl_ushort3; typedef union { cl_ushort CL_ALIGNED(16) s[8]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; #endif #if defined( __CL_USHORT2__) __cl_ushort2 v2[4]; #endif #if defined( __CL_USHORT4__) __cl_ushort4 v4[2]; #endif #if defined( __CL_USHORT8__ ) __cl_ushort8 v8; #endif }cl_ushort8; typedef union { cl_ushort CL_ALIGNED(32) s[16]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; #endif #if defined( __CL_USHORT2__) __cl_ushort2 v2[8]; #endif #if defined( __CL_USHORT4__) __cl_ushort4 v4[4]; #endif #if defined( __CL_USHORT8__ ) __cl_ushort8 v8[2]; #endif #if defined( __CL_USHORT16__ ) __cl_ushort16 v16; #endif }cl_ushort16; /* ---- cl_halfn ---- */ typedef union { cl_half CL_ALIGNED(4) s[2]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_half x, y; }; __CL_ANON_STRUCT__ struct{ cl_half s0, s1; }; __CL_ANON_STRUCT__ struct{ cl_half lo, hi; }; #endif #if defined( __CL_HALF2__) __cl_half2 v2; #endif }cl_half2; typedef union { cl_half CL_ALIGNED(8) s[4]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; }; __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; }; #endif #if defined( __CL_HALF2__) __cl_half2 v2[2]; #endif #if defined( __CL_HALF4__) __cl_half4 v4; #endif }cl_half4; /* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. 
*/ typedef cl_half4 cl_half3; typedef union { cl_half CL_ALIGNED(16) s[8]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; }; __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; }; #endif #if defined( __CL_HALF2__) __cl_half2 v2[4]; #endif #if defined( __CL_HALF4__) __cl_half4 v4[2]; #endif #if defined( __CL_HALF8__ ) __cl_half8 v8; #endif }cl_half8; typedef union { cl_half CL_ALIGNED(32) s[16]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; }; #endif #if defined( __CL_HALF2__) __cl_half2 v2[8]; #endif #if defined( __CL_HALF4__) __cl_half4 v4[4]; #endif #if defined( __CL_HALF8__ ) __cl_half8 v8[2]; #endif #if defined( __CL_HALF16__ ) __cl_half16 v16; #endif }cl_half16; /* ---- cl_intn ---- */ typedef union { cl_int CL_ALIGNED(8) s[2]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_int x, y; }; __CL_ANON_STRUCT__ struct{ cl_int s0, s1; }; __CL_ANON_STRUCT__ struct{ cl_int lo, hi; }; #endif #if defined( __CL_INT2__) __cl_int2 v2; #endif }cl_int2; typedef union { cl_int CL_ALIGNED(16) s[4]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; }; __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; #endif #if defined( __CL_INT2__) __cl_int2 v2[2]; #endif #if defined( __CL_INT4__) __cl_int4 v4; #endif }cl_int4; /* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. 
*/ typedef cl_int4 cl_int3; typedef union { cl_int CL_ALIGNED(32) s[8]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; #endif #if defined( __CL_INT2__) __cl_int2 v2[4]; #endif #if defined( __CL_INT4__) __cl_int4 v4[2]; #endif #if defined( __CL_INT8__ ) __cl_int8 v8; #endif }cl_int8; typedef union { cl_int CL_ALIGNED(64) s[16]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; #endif #if defined( __CL_INT2__) __cl_int2 v2[8]; #endif #if defined( __CL_INT4__) __cl_int4 v4[4]; #endif #if defined( __CL_INT8__ ) __cl_int8 v8[2]; #endif #if defined( __CL_INT16__ ) __cl_int16 v16; #endif }cl_int16; /* ---- cl_uintn ---- */ typedef union { cl_uint CL_ALIGNED(8) s[2]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_uint x, y; }; __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; }; __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; }; #endif #if defined( __CL_UINT2__) __cl_uint2 v2; #endif }cl_uint2; typedef union { cl_uint CL_ALIGNED(16) s[4]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; }; __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; #endif #if defined( __CL_UINT2__) __cl_uint2 v2[2]; #endif #if defined( __CL_UINT4__) __cl_uint4 v4; #endif }cl_uint4; /* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. 
*/ typedef cl_uint4 cl_uint3; typedef union { cl_uint CL_ALIGNED(32) s[8]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; #endif #if defined( __CL_UINT2__) __cl_uint2 v2[4]; #endif #if defined( __CL_UINT4__) __cl_uint4 v4[2]; #endif #if defined( __CL_UINT8__ ) __cl_uint8 v8; #endif }cl_uint8; typedef union { cl_uint CL_ALIGNED(64) s[16]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; #endif #if defined( __CL_UINT2__) __cl_uint2 v2[8]; #endif #if defined( __CL_UINT4__) __cl_uint4 v4[4]; #endif #if defined( __CL_UINT8__ ) __cl_uint8 v8[2]; #endif #if defined( __CL_UINT16__ ) __cl_uint16 v16; #endif }cl_uint16; /* ---- cl_longn ---- */ typedef union { cl_long CL_ALIGNED(16) s[2]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_long x, y; }; __CL_ANON_STRUCT__ struct{ cl_long s0, s1; }; __CL_ANON_STRUCT__ struct{ cl_long lo, hi; }; #endif #if defined( __CL_LONG2__) __cl_long2 v2; #endif }cl_long2; typedef union { cl_long CL_ALIGNED(32) s[4]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; }; __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; }; #endif #if defined( __CL_LONG2__) __cl_long2 v2[2]; #endif #if defined( __CL_LONG4__) __cl_long4 v4; #endif }cl_long4; /* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. 
*/ typedef cl_long4 cl_long3; typedef union { cl_long CL_ALIGNED(64) s[8]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; #endif #if defined( __CL_LONG2__) __cl_long2 v2[4]; #endif #if defined( __CL_LONG4__) __cl_long4 v4[2]; #endif #if defined( __CL_LONG8__ ) __cl_long8 v8; #endif }cl_long8; typedef union { cl_long CL_ALIGNED(128) s[16]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; #endif #if defined( __CL_LONG2__) __cl_long2 v2[8]; #endif #if defined( __CL_LONG4__) __cl_long4 v4[4]; #endif #if defined( __CL_LONG8__ ) __cl_long8 v8[2]; #endif #if defined( __CL_LONG16__ ) __cl_long16 v16; #endif }cl_long16; /* ---- cl_ulongn ---- */ typedef union { cl_ulong CL_ALIGNED(16) s[2]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_ulong x, y; }; __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; }; __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; }; #endif #if defined( __CL_ULONG2__) __cl_ulong2 v2; #endif }cl_ulong2; typedef union { cl_ulong CL_ALIGNED(32) s[4]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; }; __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; #endif #if defined( __CL_ULONG2__) __cl_ulong2 v2[2]; #endif #if defined( __CL_ULONG4__) __cl_ulong4 v4; #endif }cl_ulong4; /* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. 
*/ typedef cl_ulong4 cl_ulong3; typedef union { cl_ulong CL_ALIGNED(64) s[8]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; #endif #if defined( __CL_ULONG2__) __cl_ulong2 v2[4]; #endif #if defined( __CL_ULONG4__) __cl_ulong4 v4[2]; #endif #if defined( __CL_ULONG8__ ) __cl_ulong8 v8; #endif }cl_ulong8; typedef union { cl_ulong CL_ALIGNED(128) s[16]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; #endif #if defined( __CL_ULONG2__) __cl_ulong2 v2[8]; #endif #if defined( __CL_ULONG4__) __cl_ulong4 v4[4]; #endif #if defined( __CL_ULONG8__ ) __cl_ulong8 v8[2]; #endif #if defined( __CL_ULONG16__ ) __cl_ulong16 v16; #endif }cl_ulong16; /* --- cl_floatn ---- */ typedef union { cl_float CL_ALIGNED(8) s[2]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_float x, y; }; __CL_ANON_STRUCT__ struct{ cl_float s0, s1; }; __CL_ANON_STRUCT__ struct{ cl_float lo, hi; }; #endif #if defined( __CL_FLOAT2__) __cl_float2 v2; #endif }cl_float2; typedef union { cl_float CL_ALIGNED(16) s[4]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; }; __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; }; #endif #if defined( __CL_FLOAT2__) __cl_float2 v2[2]; #endif #if defined( __CL_FLOAT4__) __cl_float4 v4; #endif }cl_float4; /* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. 
*/ typedef cl_float4 cl_float3; typedef union { cl_float CL_ALIGNED(32) s[8]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; }; #endif #if defined( __CL_FLOAT2__) __cl_float2 v2[4]; #endif #if defined( __CL_FLOAT4__) __cl_float4 v4[2]; #endif #if defined( __CL_FLOAT8__ ) __cl_float8 v8; #endif }cl_float8; typedef union { cl_float CL_ALIGNED(64) s[16]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; #endif #if defined( __CL_FLOAT2__) __cl_float2 v2[8]; #endif #if defined( __CL_FLOAT4__) __cl_float4 v4[4]; #endif #if defined( __CL_FLOAT8__ ) __cl_float8 v8[2]; #endif #if defined( __CL_FLOAT16__ ) __cl_float16 v16; #endif }cl_float16; /* --- cl_doublen ---- */ typedef union { cl_double CL_ALIGNED(16) s[2]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_double x, y; }; __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; #endif #if defined( __CL_DOUBLE2__) __cl_double2 v2; #endif }cl_double2; typedef union { cl_double CL_ALIGNED(32) s[4]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; }; __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; #endif #if defined( __CL_DOUBLE2__) __cl_double2 v2[2]; #endif #if defined( __CL_DOUBLE4__) __cl_double4 v4; #endif }cl_double4; /* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. 
*/ typedef cl_double4 cl_double3; typedef union { cl_double CL_ALIGNED(64) s[8]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; #endif #if defined( __CL_DOUBLE2__) __cl_double2 v2[4]; #endif #if defined( __CL_DOUBLE4__) __cl_double4 v4[2]; #endif #if defined( __CL_DOUBLE8__ ) __cl_double8 v8; #endif }cl_double8; typedef union { cl_double CL_ALIGNED(128) s[16]; #if __CL_HAS_ANON_STRUCT__ __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; #endif #if defined( __CL_DOUBLE2__) __cl_double2 v2[8]; #endif #if defined( __CL_DOUBLE4__) __cl_double4 v4[4]; #endif #if defined( __CL_DOUBLE8__ ) __cl_double8 v8[2]; #endif #if defined( __CL_DOUBLE16__ ) __cl_double16 v16; #endif }cl_double16; /* Macro to facilitate debugging * Usage: * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" * Each line thereafter of OpenCL C source must end with: \n\ * The last line ends in "; * * Example: * * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ * kernel void foo( int a, float * b ) \n\ * { \n\ * // my comment \n\ * *b[ get_global_id(0)] = a; \n\ * } \n\ * "; * * This should correctly set up the line, (column) and file information for your source * string so you can do source level debugging. 
*/ #define __CL_STRINGIFY( _x ) # _x #define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) #define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" #ifdef __cplusplus } #endif #undef __CL_HAS_ANON_STRUCT__ #undef __CL_ANON_STRUCT__ #if defined( _WIN32) && defined(_MSC_VER) #if _MSC_VER >=1500 #pragma warning( pop ) #endif #endif #endif /* __CL_PLATFORM_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_va_api_media_sharing_intel.h ================================================ /******************************************************************************* * Copyright (c) 2008-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ /*****************************************************************************\ Copyright (c) 2013-2019 Intel Corporation All Rights Reserved. THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. File Name: cl_va_api_media_sharing_intel.h Abstract: Notes: \*****************************************************************************/ #ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H #define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H #include #include #include #ifdef __cplusplus extern "C" { #endif /****************************************** * cl_intel_va_api_media_sharing extension * *******************************************/ #define cl_intel_va_api_media_sharing 1 /* error codes */ #define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098 #define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099 #define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100 #define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101 /* cl_va_api_device_source_intel */ #define CL_VA_API_DISPLAY_INTEL 0x4094 /* cl_va_api_device_set_intel */ #define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095 #define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096 /* cl_context_info */ #define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097 /* cl_mem_info */ #define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098 /* cl_image_info */ #define CL_IMAGE_VA_API_PLANE_INTEL 0x4099 /* cl_command_type */ #define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A #define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B typedef cl_uint cl_va_api_device_source_intel; typedef cl_uint cl_va_api_device_set_intel; extern CL_API_ENTRY cl_int CL_API_CALL clGetDeviceIDsFromVA_APIMediaAdapterINTEL( cl_platform_id platform, 
cl_va_api_device_source_intel media_adapter_type, void* media_adapter, cl_va_api_device_set_intel media_adapter_set, cl_uint num_entries, cl_device_id* devices, cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)( cl_platform_id platform, cl_va_api_device_source_intel media_adapter_type, void* media_adapter, cl_va_api_device_set_intel media_adapter_set, cl_uint num_entries, cl_device_id* devices, cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_mem CL_API_CALL clCreateFromVA_APIMediaSurfaceINTEL( cl_context context, cl_mem_flags flags, VASurfaceID* surface, cl_uint plane, cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)( cl_context context, cl_mem_flags flags, VASurfaceID* surface, cl_uint plane, cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueAcquireVA_APIMediaSurfacesINTEL( cl_command_queue command_queue, cl_uint num_objects, const cl_mem* mem_objects, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem* mem_objects, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; extern CL_API_ENTRY cl_int CL_API_CALL clEnqueueReleaseVA_APIMediaSurfacesINTEL( cl_command_queue command_queue, cl_uint num_objects, const cl_mem* mem_objects, cl_uint num_events_in_wait_list, const cl_event* event_wait_list, cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)( cl_command_queue command_queue, cl_uint num_objects, const cl_mem* mem_objects, cl_uint num_events_in_wait_list, const cl_event* 
event_wait_list, cl_event* event) CL_EXT_SUFFIX__VERSION_1_2; #ifdef __cplusplus } #endif #endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/cl_version.h ================================================ /******************************************************************************* * Copyright (c) 2018-2020 The Khronos Group Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ #ifndef __CL_VERSION_H #define __CL_VERSION_H /* Detect which version to target */ #if !defined(CL_TARGET_OPENCL_VERSION) #pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)") #define CL_TARGET_OPENCL_VERSION 220 #endif #if CL_TARGET_OPENCL_VERSION != 100 && \ CL_TARGET_OPENCL_VERSION != 110 && \ CL_TARGET_OPENCL_VERSION != 120 && \ CL_TARGET_OPENCL_VERSION != 200 && \ CL_TARGET_OPENCL_VERSION != 210 && \ CL_TARGET_OPENCL_VERSION != 220 && \ CL_TARGET_OPENCL_VERSION != 300 #pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220, 300). 
Defaulting to 220 (OpenCL 2.2)") #undef CL_TARGET_OPENCL_VERSION #define CL_TARGET_OPENCL_VERSION 220 #endif /* OpenCL Version */ #if CL_TARGET_OPENCL_VERSION >= 300 && !defined(CL_VERSION_3_0) #define CL_VERSION_3_0 1 #endif #if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2) #define CL_VERSION_2_2 1 #endif #if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1) #define CL_VERSION_2_1 1 #endif #if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0) #define CL_VERSION_2_0 1 #endif #if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2) #define CL_VERSION_1_2 1 #endif #if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1) #define CL_VERSION_1_1 1 #endif #if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0) #define CL_VERSION_1_0 1 #endif /* Allow deprecated APIs for older OpenCL versions. */ #if CL_TARGET_OPENCL_VERSION <= 220 && !defined(CL_USE_DEPRECATED_OPENCL_2_2_APIS) #define CL_USE_DEPRECATED_OPENCL_2_2_APIS #endif #if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS) #define CL_USE_DEPRECATED_OPENCL_2_1_APIS #endif #if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS) #define CL_USE_DEPRECATED_OPENCL_2_0_APIS #endif #if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) #define CL_USE_DEPRECATED_OPENCL_1_2_APIS #endif #if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) #define CL_USE_DEPRECATED_OPENCL_1_1_APIS #endif #if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS) #define CL_USE_DEPRECATED_OPENCL_1_0_APIS #endif #endif /* __CL_VERSION_H */ ================================================ FILE: GpuMemLatency/OpenCL/include/CL/opencl.h ================================================ /******************************************************************************* * Copyright (c) 2008-2020 The Khronos Group Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ #ifndef __OPENCL_H #define __OPENCL_H #ifdef __cplusplus extern "C" { #endif #include #include #include #include #ifdef __cplusplus } #endif #endif /* __OPENCL_H */ ================================================ FILE: GpuMemLatency/atomic_test.c ================================================ #include "opencltest.h" float int_atomic_latency_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t iterations, short local, uint32_t *time_ms) { cl_int ret; cl_int result = 0; size_t global_item_size = 2; size_t local_item_size = 1; float latency; uint32_t time_diff_ms; uint32_t A = 0; if (local) { local_item_size = 2; } cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(uint32_t), NULL, &ret); cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &result); ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, sizeof(uint32_t), &A, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(cl_int), &result, 0, NULL, NULL); clFinish(command_queue); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj); start_timing(); ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 
0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret); latency = 0; goto cleanup; } clFinish(command_queue); time_diff_ms = end_timing(); *time_ms = time_diff_ms; latency = (1e6 * (float)time_diff_ms / (float)(iterations)) / 2; cleanup: clFlush(command_queue); clFinish(command_queue); clReleaseMemObject(a_mem_obj); clReleaseMemObject(result_obj); return latency; } float c2c_atomic_latency_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t iterations) { cl_int ret; cl_int result = 0; size_t global_item_size; size_t local_item_size = 1; float latency; uint32_t time_diff_ms; uint32_t A; cl_uint cuCount = getCuCount(); cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(uint32_t), NULL, &ret); cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &ret); global_item_size = cuCount; float* result_arr = (float*)malloc(sizeof(float) * cuCount * cuCount); for (cl_int t1_idx = 0; t1_idx < cuCount; t1_idx++) { for (cl_int t2_idx = 0; t2_idx < cuCount; t2_idx++) { if (t1_idx == t2_idx) continue; fprintf(stderr, "Testing %d -> %d\n", t1_idx, t2_idx); A = 0; ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, sizeof(uint32_t), &A, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(cl_int), &result, 0, NULL, NULL); clFinish(command_queue); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj); clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&t1_idx); clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&t2_idx); start_timing(); ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. 
clEnqueueNDRangeKernel returned %d\n", ret); latency = 0; goto cleanup; } clFinish(command_queue); time_diff_ms = end_timing(); latency = (1e6 * (float)time_diff_ms / (float)(iterations)) / 2; fprintf(stderr, "%d -> %d: %f\n", t1_idx, t2_idx, latency); result_arr[t1_idx * cuCount + t2_idx] = latency; } } for (cl_int i = 0; i < cuCount; i++) { printf(",%d", i); } printf("\n"); for (cl_int t1_idx = 0; t1_idx < cuCount; t1_idx++) { printf("%d", t1_idx); for (cl_int t2_idx = 0; t2_idx < cuCount; t2_idx++) { if (t1_idx == t2_idx) printf(",x"); else printf(",%f", result_arr[t1_idx * cuCount + t2_idx]); } printf("\n"); } cleanup: clFlush(command_queue); clFinish(command_queue); clReleaseMemObject(a_mem_obj); clReleaseMemObject(result_obj); free(result_arr); return latency; } float int_atomic_add_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, size_t threads, size_t localsize) { // Loop unroll factor const float opsPerIteration = 8.0f; cl_int ret; int64_t time_diff_ms = 0; float gOpsPerSec; uint32_t iterations = 7000; uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * threads); for (int i = 0; i < threads; i++) A[i] = i; cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint32_t) * threads, NULL, &ret); ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, sizeof(uint32_t) * threads, A, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations); clFinish(command_queue); while (time_diff_ms < TARGET_TIME_MS / 2) { start_timing(); ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &threads, &localsize, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. 
clEnqueueNDRangeKernel returned %d\n", ret); gOpsPerSec = 0; goto int_atomic_add_test_end; } clFinish(command_queue); time_diff_ms = end_timing(); float totalOps = (float)iterations * opsPerIteration * (float)threads; gOpsPerSec = ((float)totalOps / 1e9) / ((float)time_diff_ms / 1000); fprintf(stderr, "GOPS: %f, elapsed time: %lld\n", gOpsPerSec, time_diff_ms); iterations = adjust_iterations(iterations, time_diff_ms); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations); } int_atomic_add_test_end: clReleaseMemObject(a_mem_obj); free(A); return gOpsPerSec; } ================================================ FILE: GpuMemLatency/bw_test.c ================================================ #include "opencltest.h" float bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint64_t list_size, uint32_t thread_count, uint32_t local_size, uint32_t skip, uint32_t chase_iterations) { size_t global_item_size = thread_count; size_t local_item_size = local_size; float bandwidth, total_data_gb; cl_int ret; cl_int float4size = list_size / 4; int64_t time_diff_ms; if (skip == 0) { // nemes's read-combining-defeating heuristic uint32_t region_size = list_size * sizeof(float); uint32_t current_region_steps = (uint32_t)(region_size / (local_size * 4)); skip = (chase_iterations + current_region_steps + 1) * local_size * 4; } float* A = (float*)malloc(sizeof(float) * list_size); float* result = (float*)malloc(sizeof(float) * thread_count); if (!A || !result) { fprintf(stderr, "Failed to allocate memory for test size %lu KB\n", list_size); } // assume that cl_uint size is 4 bytes, same as float size cl_uint* start_offsets = (cl_uint*)malloc(sizeof(cl_uint) * thread_count); cl_uint* calculated_offsets = (cl_uint*)malloc(sizeof(cl_uint) * thread_count); memset(calculated_offsets, 0, sizeof(uint32_t) * thread_count); for (uint32_t i = 0; i < list_size; i++) { A[i] = (float)(i * 0.5); } // tell each thread where to start for (uint32_t i = 0; i < thread_count; 
i++) { uint32_t localId = i % local_size; uint32_t groupId = i / local_size; start_offsets[i] = (cl_uint)((groupId * skip * local_size + localId) % (float4size - 1)); // randomly start each workgroup somewhere - ends up being really bad /*cl_uint groupOffset = rand() % (float4size / local_size); start_offsets[i] = (cl_uint)((groupOffset * local_size + localId) % (float4size - 1));*/ } // copy array to device cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_size * sizeof(float), NULL, &ret); ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, list_size * sizeof(float), A, 0, NULL, NULL); cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret); ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL); cl_mem start_offsets_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint) * thread_count, NULL, &ret); if (ret != 0) fprintf(stderr, "create buffer for start offsets failed. ret = %d\n", ret); ret = clEnqueueWriteBuffer(command_queue, start_offsets_obj, CL_TRUE, 0, sizeof(cl_uint) * thread_count, start_offsets, 0, NULL, NULL); if (ret != 0) fprintf(stderr, "enqueue write buffer for start offsets failed. ret = %d\n", ret); // Set kernel arguments for __kernel void sum_bw_test(__global float* A, int count, int float4size, __global float* ret, int skip, __global int *startPositions) clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&float4size); clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&result_obj); clSetKernelArg(kernel, 4, sizeof(cl_int), (void*)&skip); clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&start_offsets_obj); clFinish(command_queue); // writes should be blocking, but are they? 
start_timing(); ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret); bandwidth = 0; goto cleanup; } ret = clFinish(command_queue); // returns success even when TDR happens? if (ret != CL_SUCCESS) { printf("Failed to finish command queue. clFinish returned %d\n", ret); bandwidth = 0; goto cleanup; } time_diff_ms = end_timing(); // each thread does iterations reads total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count + thread_count) / 1e9; bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms; //fprintf(stderr, "%llu ms, %llu GB\n", time_diff_ms, total_data_gb); ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL); if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret); clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, start_offsets_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, calculated_offsets, 0, NULL, NULL); if (ret != 0) fprintf(stderr, "enqueue read buffer for start offsets failed. ret = %d\n", ret); clFinish(command_queue); if (memcmp(calculated_offsets, start_offsets, sizeof(uint32_t) * thread_count)) { fprintf(stderr, "mismatch in calculated start offsets\n"); for (uint32_t i = 0; i < thread_count; i++) { if (calculated_offsets[i] != start_offsets[i]) { fprintf(stderr, "At index %u, calculated from GPU = %u, calculated on CPU = %u. skip=%u\n", i, calculated_offsets[i], start_offsets[i], skip); break; } } } //fprintf(stderr, "Finished reading result. 
Sum: %d\n", result[0]); cleanup: clFlush(command_queue); clFinish(command_queue); clReleaseMemObject(a_mem_obj); clReleaseMemObject(result_obj); clReleaseMemObject(start_offsets_obj); free(A); free(result); free(start_offsets); free(calculated_offsets); return bandwidth; } float tex_bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint64_t width, uint64_t height, uint32_t thread_count, uint32_t local_size, uint32_t randomize, uint32_t chase_iterations, int64_t *time_ms) { size_t global_item_size = thread_count; size_t local_item_size = local_size; float texels = 0; cl_int ret; int64_t time_diff_ms; uint64_t tex_array_size = 3 * width * height; // texture size in bytes cl_mem tex_mem_obj = NULL, a_mem_obj = NULL, result_obj = NULL; float* A = (float*)malloc(sizeof(float) * tex_array_size); float* result = (float*)malloc(sizeof(float) * thread_count); if (!A || !result) { fprintf(stderr, "Failed to allocate memory for %lu x %lu texture\n", width, height); } // fill array for (uint64_t i = 0; i < tex_array_size; i++) { A[i] = randomize ? 
rand() * 0.2f : (float)(i * 0.5); } // create texture from it //a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, tex_array_size * sizeof(float), A, &ret); //ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, tex_array_size * sizeof(float), A, 0, NULL, NULL); cl_image_desc imageDesc; memset(&imageDesc, 0, sizeof(cl_image_desc)); imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D; imageDesc.image_width = width; imageDesc.image_height = height; //imageDesc.mem_object = a_mem_obj; //imageDesc.buffer = A; cl_image_format imageFormat; imageFormat.image_channel_order = CL_R; imageFormat.image_channel_data_type = CL_FLOAT; tex_mem_obj = clCreateImage(context, CL_MEM_READ_ONLY, &imageFormat, &imageDesc, A, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to create 2d texture: %d\n", ret); goto tex_bw_cleanup; } size_t origin[] = { 0, 0, 0 }; size_t region[] = { width, height, 1 }; ret = clEnqueueWriteImage(command_queue, tex_mem_obj, CL_TRUE, origin, region, 0, 0, A, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to copy 2d texture: %d\n", ret); goto tex_bw_cleanup; } fprintf(stderr, "Created image\n"); // copy array to device result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret); ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL); // Set kernel arguments for __kernel void sum_bw_test(__global float* A, int count, int float4size, __global float* ret, int skip, __global int *startPositions) clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tex_mem_obj); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj); clFinish(command_queue); // writes should be blocking, but are they? 
start_timing(); ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret); texels = 0; goto tex_bw_cleanup; } ret = clFinish(command_queue); // returns success even when TDR happens? if (ret != CL_SUCCESS) { printf("Failed to finish command queue. clFinish returned %d\n", ret); texels = 0; goto tex_bw_cleanup; } time_diff_ms = end_timing(); fprintf(stderr, "elapsed time: %lld ms\n", time_diff_ms); // each thread does iterations samples, and each sample returns a 4-wide vector texels = 1000 * (float)(chase_iterations * thread_count * 4 / 1e9) / (float)time_diff_ms; fprintf(stderr, "%u iterations, %u threads, %lu ms\n", chase_iterations, thread_count, time_diff_ms); ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL); if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. 
ret = %d\n", ret); clFinish(command_queue); *time_ms = time_diff_ms; tex_bw_cleanup: clFlush(command_queue); clFinish(command_queue); clReleaseMemObject(tex_mem_obj); clReleaseMemObject(a_mem_obj); clReleaseMemObject(result_obj); free(A); free(result); return texels; } // must be at least as large as local memory test size in kernel // list size in 32-bit elements #define local_mem_bw_test_size 8192 float local_bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, int64_t *time_ms) { size_t global_item_size = thread_count; size_t local_item_size = local_size; float bandwidth, total_data_gb; cl_int ret; int64_t time_diff_ms; float* A = (uint32_t*)malloc(sizeof(uint32_t) * local_mem_bw_test_size); float* result = (uint32_t*)malloc(sizeof(uint32_t) * thread_count); if (!A || !result) { fprintf(stderr, "Failed to allocate memory for test size %lu KB\n", local_mem_bw_test_size * 4); } for (uint32_t i = 0; i < local_mem_bw_test_size; i++) { A[i] = i + .02; } // copy array to device cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, local_mem_bw_test_size * sizeof(float), NULL, &ret); ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, local_mem_bw_test_size * sizeof(float), A, 0, NULL, NULL); cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret); ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj); clFinish(command_queue); // writes should be blocking, but are they? 
start_timing(); ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret); bandwidth = 0; goto cleanup; } ret = clFinish(command_queue); // returns success even when TDR happens? if (ret != CL_SUCCESS) { printf("Failed to finish command queue. clFinish returned %d\n", ret); bandwidth = 0; goto cleanup; } time_diff_ms = end_timing(); *time_ms = time_diff_ms; // each thread does iterations reads total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count) / 1e9; bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms; //fprintf(stderr, "%llu ms, %llu GB\n", time_diff_ms, total_data_gb); ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL); if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret); clFinish(command_queue); cleanup: clFlush(command_queue); clFinish(command_queue); clReleaseMemObject(a_mem_obj); clReleaseMemObject(result_obj); free(A); free(result); return bandwidth; } #define buffer_test_size 4096 // 1024x uint4 float buffer_bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, int64_t* time_ms) { size_t global_item_size = thread_count; size_t local_item_size = local_size; float bandwidth, total_data_gb; cl_int ret; int64_t time_diff_ms; cl_mem result_obj; uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * buffer_test_size); float* result = (uint32_t*)malloc(sizeof(float) * thread_count); if (!A || !result) { fprintf(stderr, "Failed to allocate memory for test size %lu KB\n", local_mem_bw_test_size * 4); } for (uint32_t i = 0; i < buffer_test_size; i++) { A[i] = i + 1; } // copy array to device cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, 
buffer_test_size * sizeof(uint32_t), NULL, &ret); ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, buffer_test_size * sizeof(uint32_t), A, 0, NULL, NULL); // handle cl_image stuff cl_image_format imageFormat; imageFormat.image_channel_data_type = CL_UNSIGNED_INT32; imageFormat.image_channel_order = CL_R; cl_image_desc imageDesc; memset(&imageDesc, 0, sizeof(cl_image_desc)); imageDesc.buffer = a_mem_obj; imageDesc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; imageDesc.image_width = buffer_test_size; // width in pixels cl_mem tex_obj = tex_obj = clCreateImage(context, CL_MEM_READ_ONLY, &imageFormat, &imageDesc, NULL, &ret); size_t origin[] = { 0, 0, 0 }; size_t region[] = { imageDesc.image_width, 1, 1 }; ret = clEnqueueWriteImage(command_queue, tex_obj, CL_TRUE, origin, region, 0, 0, A, 0, NULL, NULL); result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * thread_count, NULL, &ret); ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * thread_count, result, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tex_obj); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj); clFinish(command_queue); // writes should be blocking, but are they? start_timing(); ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret); bandwidth = 0; goto cleanup; } ret = clFinish(command_queue); // returns success even when TDR happens? if (ret != CL_SUCCESS) { printf("Failed to finish command queue. 
clFinish returned %d\n", ret); bandwidth = 0; goto cleanup; } time_diff_ms = end_timing(); *time_ms = time_diff_ms; // each thread does iterations reads total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count) / 1e9; bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms; //fprintf(stderr, "%llu ms, %llu GB\n", time_diff_ms, total_data_gb); ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL); if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret); clFinish(command_queue); cleanup: clFlush(command_queue); clFinish(command_queue); clReleaseMemObject(a_mem_obj); clReleaseMemObject(result_obj); free(A); free(result); return bandwidth; } float local_chase_bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, uint32_t wave_size, int64_t* time_ms) { size_t global_item_size = thread_count; size_t local_item_size = local_size; float bandwidth, total_data_gb; cl_int ret; int64_t time_diff_ms; uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * local_mem_bw_test_size); uint32_t* result = (uint32_t*)malloc(sizeof(uint32_t) * thread_count); if (!A || !result) { fprintf(stderr, "Failed to allocate memory for test size %lu KB\n", local_mem_bw_test_size * 4); } for (uint32_t i = 0; i < local_mem_bw_test_size; i++) { // assumes local_mem_bw_test_size is a power of 2. 
A[i] = i + wave_size & (local_mem_bw_test_size - 1); } // copy array to device cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, local_mem_bw_test_size * sizeof(uint32_t), NULL, &ret); ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, local_mem_bw_test_size * sizeof(uint32_t), A, 0, NULL, NULL); cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint32_t) * thread_count, NULL, &ret); ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj); clFinish(command_queue); // writes should be blocking, but are they? start_timing(); ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret); bandwidth = 0; goto cleanup; } ret = clFinish(command_queue); // returns success even when TDR happens? if (ret != CL_SUCCESS) { printf("Failed to finish command queue. clFinish returned %d\n", ret); bandwidth = 0; goto cleanup; } time_diff_ms = end_timing(); *time_ms = time_diff_ms; // each thread does iterations reads total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count) / 1e9; bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms; //fprintf(stderr, "%llu ms, %llu GB\n", time_diff_ms, total_data_gb); ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t) * thread_count, result, 0, NULL, NULL); if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. 
ret = %d\n", ret); clFinish(command_queue); cleanup: clFlush(command_queue); clFinish(command_queue); clReleaseMemObject(a_mem_obj); clReleaseMemObject(result_obj); free(A); free(result); return bandwidth; } #define local64_test_size 2048 float local_64_bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, int64_t* time_ms) { size_t global_item_size = thread_count; size_t local_item_size = local_size; float bandwidth, total_data_gb; cl_int ret; int64_t time_diff_ms; uint64_t* A = (uint64_t*)malloc(sizeof(uint64_t) * local64_test_size); uint64_t* result = (uint64_t*)malloc(sizeof(uint64_t) * thread_count); if (!A || !result) { fprintf(stderr, "Failed to allocate memory for test size %lu KB\n", local64_test_size * 4); } for (uint64_t i = 0; i < local64_test_size; i++) { A[i] = i; } // copy array to device cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, local64_test_size * sizeof(uint64_t), NULL, &ret); ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, local64_test_size * sizeof(uint64_t), A, 0, NULL, NULL); cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint64_t) * thread_count, NULL, &ret); ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint64_t) * thread_count, result, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj); clFinish(command_queue); // writes should be blocking, but are they? start_timing(); ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. 
clEnqueueNDRangeKernel returned %d\n", ret); bandwidth = 0; goto cleanup; } ret = clFinish(command_queue); // returns success even when TDR happens? if (ret != CL_SUCCESS) { printf("Failed to finish command queue. clFinish returned %d\n", ret); bandwidth = 0; goto cleanup; } time_diff_ms = end_timing(); *time_ms = time_diff_ms; // each thread does iterations reads total_data_gb = sizeof(float) * ((float)chase_iterations * thread_count + thread_count) / 1e9; bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms; ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint64_t) * thread_count, result, 0, NULL, NULL); if (ret != 0) fprintf(stderr, "enqueue read buffer for result failed. ret = %d\n", ret); clFinish(command_queue); cleanup: clFlush(command_queue); clFinish(command_queue); clReleaseMemObject(a_mem_obj); clReleaseMemObject(result_obj); free(A); free(result); return bandwidth; } // default test sizes for link bandwidth const uint64_t default_link_test_sizes[] = { 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152 }; void link_bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t iterations) { cl_int ret; cl_int result = 0; size_t global_item_size; size_t local_item_size = 1; float gpu_to_host_bandwidth, host_to_gpu_bandwidth, total_data_gb; uint32_t time_diff_ms, loop_iterations; uint32_t* A; int test_size_count = sizeof(default_link_test_sizes) / sizeof(unsigned long long); float* results = (float*)malloc(sizeof(float) * 2 * test_size_count); memset(results, 0, sizeof(float) * 2 * test_size_count); printf("Copy Size (KB), Host to GPU (GB/s), GPU to Host (GB/s)\n"); for (int size_idx = 0; size_idx < test_size_count; size_idx++) { uint64_t testSizeBytes = default_link_test_sizes[size_idx] * 1024; uint64_t testSizeKb = default_link_test_sizes[size_idx]; if (testSizeBytes > max_global_test_size) { printf("%d K would exceed device's max buffer size of %lu K, 
stopping here.\n", testSizeKb, max_global_test_size / 1024); break; } A = (uint32_t*)malloc(testSizeBytes); memset(A, 0, testSizeBytes); cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, testSizeBytes, NULL, &ret); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj); global_item_size = 1; // only hit the first element, not like we're going to spend time verifying an entire arr especially at large sizes // use 1M iterations = 1 GB total to transfer loop_iterations = ((uint64_t)iterations * 1000) / (uint64_t)testSizeBytes; //fprintf(stderr, "Size: %llu KB, Iterations: %d, base iterations: %d\n", testSizeKb, loop_iterations, iterations); start_timing(); for (int iter_idx = 0; iter_idx < loop_iterations; iter_idx++) { ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, testSizeBytes, A, 0, NULL, NULL); clFinish(command_queue); } time_diff_ms = end_timing(); total_data_gb = ((float)loop_iterations * testSizeBytes) / 1e9; host_to_gpu_bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms; results[size_idx * 2] = host_to_gpu_bandwidth; //fprintf(stderr, "Write to GPU: %f GB transferred in %d ms\n", total_data_gb, time_diff_ms); start_timing(); for (int iter_idx = 0; iter_idx < loop_iterations; iter_idx++) { ret = clEnqueueReadBuffer(command_queue, a_mem_obj, CL_TRUE, 0, testSizeBytes, A, 0, NULL, NULL); clFinish(command_queue); } time_diff_ms = end_timing(); total_data_gb = ((float)loop_iterations * testSizeBytes) / 1e9; gpu_to_host_bandwidth = 1000 * (float)total_data_gb / (float)time_diff_ms; results[size_idx * 2 + 1] = gpu_to_host_bandwidth; //fprintf(stderr, "Read from GPU: %f GB transferred in %d ms\n", total_data_gb, time_diff_ms); printf("%llu,%f,%f\n", testSizeKb, host_to_gpu_bandwidth, gpu_to_host_bandwidth); clReleaseMemObject(a_mem_obj); free(A); } float max = 0; for (int size_idx = 0; size_idx < test_size_count; size_idx++) { if (results[size_idx * 2] > max) max = results[size_idx * 2]; if (results[size_idx * 2 + 1] > 
max) max = results[size_idx * 2 + 1]; } printf("Link bandwidth: %f GB/s\n", max); cleanup: free(results); clFlush(command_queue); clFinish(command_queue); } ================================================ FILE: GpuMemLatency/common.c ================================================ #include "opencltest.h" cl_device_id selected_device_id; cl_platform_id selected_platform_id; cl_ulong max_global_test_size; int saveprogram = 0; // Fills an array using Sattolo's algo void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) { uint32_t increment = byte_increment / sizeof(uint32_t); uint32_t element_count = list_size / increment; for (int i = 0; i < element_count; i++) { pattern_arr[i * increment] = i * increment; } int iter = element_count; while (iter > 1) { iter -= 1; int j = iter - 1 == 0 ? 0 : rand() % (iter - 1); uint32_t tmp = pattern_arr[iter * increment]; pattern_arr[iter * increment] = pattern_arr[j * increment]; pattern_arr[j * increment] = tmp; } } cl_uint getCuCount() { cl_uint cuCount; size_t cuCountLen = sizeof(cl_uint); if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_COMPUTE_UNITS, cuCountLen, &cuCount, &cuCountLen)) { fprintf(stderr, "Could not get number of compute units\n"); return 0; } return cuCount; } size_t getMaxWorkgroupSize() { size_t maxWorkgroupSize; size_t workgroupSizeLen = sizeof(size_t); if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, workgroupSizeLen, &maxWorkgroupSize, &workgroupSizeLen)) { fprintf(stderr, "Could not get number of compute units\n"); return 0; } return maxWorkgroupSize; } cl_ulong get_max_constant_buffer_size() { cl_ulong constant_buffer_size = 0; if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &constant_buffer_size, NULL)) { fprintf(stderr, "Failed to get max constant buffer size\n"); } return constant_buffer_size; } cl_ulong get_max_buffer_size() { cl_ulong buffer_size = 0; 
if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &buffer_size, NULL)) { fprintf(stderr, "Failed to get max constant buffer size\n"); } return buffer_size; } cl_ulong get_max_tex_buffer_size() { cl_ulong buffer_size = 0; if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(cl_ulong), &buffer_size, NULL)) { fprintf(stderr, "Failed to get max texture buffer size\n"); } return buffer_size; } cl_ulong get_max_2d_tex_width() { cl_ulong max_width = 0; if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(cl_ulong), &max_width, NULL)) { fprintf(stderr, "Failed to get max texture width\n"); } return max_width; } cl_ulong get_max_2d_tex_height() { cl_ulong max_width = 0; if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(cl_ulong), &max_width, NULL)) { fprintf(stderr, "Failed to get max texture height\n"); } return max_width; } short checkExtensionSupport(const char *extension_name) { size_t extensionLen = 0; char* extensions; if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_EXTENSIONS, 0, NULL, &extensionLen)) { fprintf(stderr, "Could not determine memory needed to hold OpenCL extension list\n"); return 0; } extensions = (char *)malloc(extensionLen + 1); extensions[extensionLen] = 0; if (CL_SUCCESS != clGetDeviceInfo(selected_device_id, CL_DEVICE_EXTENSIONS, extensionLen, extensions, &extensionLen)) { fprintf(stderr, "Could not get OpenCL extensions list\n"); return 0; } //fprintf(stderr, "OpenCL extensions list: %s\n", extensions); // extension list is space separated size_t spaceCount = 0; for (int i = 0; i < extensionLen; i++) { if (extensions[i] == ' ') spaceCount++; } int* extensionsSpaces = (int*)malloc(sizeof(int) * (spaceCount + 1)); extensionsSpaces[0] = 0; int spaceIdx = 1; for (int i = 0; i < extensionLen; i++) { if (extensions[i] == ' ') { extensions[i] = 0; 
extensionsSpaces[spaceIdx] = i + 1; spaceIdx++; } } short found = 0; for (int i = 0; i < spaceCount; i++) { //fprintf(stderr, "Looking for %s = %s\n", extension_name, extensions + extensionsSpaces[i]); if (strcmp(extension_name, extensions + extensionsSpaces[i]) == 0) { found = 1; //fprintf(stderr, "found\n"); break; } } free(extensionsSpaces); free(extensions); return found; } /// /// populate global variables for opencl device id and platform id /// /// platform index. if -1, prompt user /// device index. if -1. prompt user /// opencl context cl_context get_context_from_user(int platform_index, int device_index) { int i = 0; int selected_platform_index = 0, selected_device_index = 0; // Get platform and device information cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(0, NULL, &ret_num_platforms); cl_platform_id* platforms = NULL; cl_device_id* devices = NULL; cl_context context = NULL; platforms = (cl_platform_id*)malloc(ret_num_platforms * sizeof(cl_platform_id)); ret = clGetPlatformIDs(ret_num_platforms, platforms, NULL); fprintf(stderr, "clGetPlatformIDs returned %d. 
%d platforms\n", ret, ret_num_platforms); for (i = 0; i < ret_num_platforms; i++) { size_t platform_name_len; char* platform_name = NULL; if (CL_SUCCESS != clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &platform_name_len)) { fprintf(stderr, "Failed to get platform info for platform %d\n", i); continue; } platform_name = (char*)malloc(platform_name_len + 1); platform_name[platform_name_len] = 0; if (CL_SUCCESS != clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, platform_name_len, platform_name, NULL)) { fprintf(stderr, "Failed to get platform name for platform %d\n", i); free(platform_name); continue; } fprintf(stderr, "Platform %d: %s\n", i, platform_name); free(platform_name); } selected_platform_index = platform_index; if (selected_platform_index == -1) { printf("Enter platform #:"); scanf("%d", &selected_platform_index); } if (selected_platform_index > ret_num_platforms - 1) { fprintf(stderr, "platform index out of range\n"); goto get_context_from_user_end; } selected_platform_id = platforms[selected_platform_index]; if (CL_SUCCESS != clGetDeviceIDs(selected_platform_id, CL_DEVICE_TYPE_ALL, 0, NULL, &ret_num_devices)) { fprintf(stderr, "Failed to enumerate device ids for platform"); return NULL; } devices = (cl_device_id*)malloc(ret_num_devices * sizeof(cl_device_id)); if (CL_SUCCESS != clGetDeviceIDs(selected_platform_id, CL_DEVICE_TYPE_ALL, ret_num_devices, devices, NULL)) { fprintf(stderr, "Failed to get device ids for platform"); free(devices); return NULL; } fprintf(stderr, "clGetDeviceIDs returned %d devices\n", ret_num_devices); for (i = 0; i < ret_num_devices; i++) { size_t device_name_len; char* device_name = NULL; if (CL_SUCCESS != clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 0, NULL, &device_name_len)) { fprintf(stderr, "Failed to get name length for device %d\n", i); continue; } //fprintf(stderr, "debug: device name length: %d\n", device_name_len); device_name = (char*)malloc(device_name_len + 1); device_name[device_name_len] = 0; if 
(CL_SUCCESS != clGetDeviceInfo(devices[i], CL_DEVICE_NAME, device_name_len, device_name, &device_name_len)) { fprintf(stderr, "Failed to get name for device %d\n", i); free(device_name); continue; } fprintf(stderr, "Device %d: %s\n", i, device_name); free(device_name); } selected_device_index = device_index; if (selected_device_index == -1) { fprintf(stderr, "Enter device #:"); scanf("%d", &selected_device_index); } if (selected_device_index > ret_num_devices - 1) { fprintf(stderr, "Device index out of range\n"); goto get_context_from_user_end; } selected_device_id = devices[selected_device_index]; // Create an OpenCL context context = clCreateContext(NULL, 1, &selected_device_id, NULL, NULL, &ret); fprintf(stderr, "clCreateContext returned %d\n", ret); fprintf(stderr, "Max workgroup size for device: %u\n", getMaxWorkgroupSize()); get_context_from_user_end: free(platforms); free(devices); return context; } cl_program build_program(cl_context context, const char* fname, const char *params) { cl_int ret; FILE* fp = NULL; char* source_str; size_t source_size; fp = fopen(fname, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel %s.\n", fname); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t*)&source_size, &ret); ret = clBuildProgram(program, 1, &selected_device_id, params, NULL, NULL); //fprintf(stderr, "clBuildProgram %s returned %d\n", fname, ret); if (ret == -11) { size_t log_size; fprintf(stderr, "OpenCL kernel build error\n"); clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); char* log = (char*)malloc(log_size); clGetProgramBuildInfo(program, selected_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); fprintf(stderr, "%s\n", log); free(log); } free(source_str); return program; } void write_program(cl_program program, const char *name) { 
size_t* binarySizes = NULL; size_t nDevices = 0; cl_int ret, memoryRequired = 0; char fname[255]; int i; unsigned char** binaries = NULL; ret = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, sizeof(size_t), &nDevices, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Could not get number of devices for program\n"); return; } fprintf(stderr, "Program is associated with %llu devices\n", nDevices); binarySizes = (size_t*)malloc(sizeof(size_t) * nDevices); if (binarySizes == NULL) { fprintf(stderr, "Failed to allocate memory for binary sizes\n"); goto getProgram_Fail; } ret = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * nDevices, binarySizes, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Could not get program binary sizes\n"); goto getProgram_Fail; } binaries = (unsigned char*)malloc(nDevices); for (i = 0; i < nDevices; i++) { fprintf(stderr, "Device %d: %llu byte program\n", i, binarySizes[i]); binaries[i] = (char*)malloc(binarySizes[i]); } ret = clGetProgramInfo(program, CL_PROGRAM_BINARIES, nDevices * sizeof(unsigned char*), binaries, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Could not get program binaries\n"); goto getProgram_Fail; } for (int i = 0; i < nDevices; i++) { snprintf(fname, 254, "prog%d_%s", i, name); FILE* dst = fopen(fname, "w"); fwrite(binaries[i], 1, binarySizes[i], dst); fclose(dst); fprintf(stderr, "Wrote compiled kernel to %s\n", fname); } getProgram_Fail: for (int i = 0; i < nDevices; i++) free(binaries[i]); free(binaries); free(binarySizes); } // Given last run settings, return target iteration count that should make the next run // go for approximately TARGET_TIME_MS uint32_t adjust_iterations(uint32_t iterations, uint64_t time_ms) { uint32_t chase_iterations = (uint32_t)((float)iterations * TARGET_TIME_MS / (float)time_ms); if (time_ms == 0) chase_iterations = iterations * 100; //fprintf(stderr, "Kernel took %llu ms. 
Setting iterations = %u\n", time_ms, chase_iterations); return chase_iterations; } ================================================ FILE: GpuMemLatency/instruction_rate.c ================================================ #include "opencltest.h" float fp64_instruction_rate_test(cl_context context, cl_command_queue command_queue, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, int float4_element_count, cl_mem a_mem_obj, cl_mem result_obj, cl_float* A, cl_float* result); float fp16_instruction_rate_test(cl_context context, cl_command_queue command_queue, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, int float4_element_count, cl_mem a_mem_obj, cl_mem result_obj, cl_float* A, cl_float* result); float run_rate_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, int float4_element_count, cl_mem a_mem_obj, cl_mem result_obj, cl_float* A, cl_float* result, float totalOps); float run_latency_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t chase_iterations, int float4_element_count, cl_mem a_mem_obj, cl_mem result_obj, cl_float* A, cl_float* result, float opsPerIteration); float global_totalOps; float instruction_rate_test(cl_context context, cl_command_queue command_queue, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, int forcefp16, int forcefp64) { size_t global_item_size = thread_count; size_t local_item_size = local_size; float gOpsPerSec = 0, opsPerIteration; cl_int ret; int64_t time_diff_ms; int float4_element_count = thread_count * 4; cl_program program = build_program(context, "instruction_rate_kernel.cl", NULL); if (saveprogram) write_program(program, "irate"); cl_kernel int32_add_rate_kernel = clCreateKernel(program, "int32_add_rate_test", &ret); cl_kernel int32_mul_rate_kernel = clCreateKernel(program, "int32_mul_rate_test", &ret); cl_kernel fp32_add_rate_kernel = 
clCreateKernel(program, "fp32_add_rate_test", &ret); cl_kernel fp32_fma_rate_kernel = clCreateKernel(program, "fp32_fma_rate_test", &ret); cl_kernel fp32_builtin_fma_rate_kernel = clCreateKernel(program, "fp32_builtin_fma_rate_test", &ret); cl_kernel fp32_mad_rate_kernel = clCreateKernel(program, "fp32_mad_rate_test", &ret); cl_kernel fp32_rcp_rate_kernel = clCreateKernel(program, "fp32_rcp_rate_test", &ret); cl_kernel fp32_rsqrt_rate_kernel = clCreateKernel(program, "fp32_rsqrt_rate_test", &ret); cl_kernel mix_fp32_int32_add_rate_kernel = clCreateKernel(program, "mix_fp32_int32_add_rate_test", &ret); cl_kernel mix_fp32_int32_addmul_rate_kernel = clCreateKernel(program, "mix_fp32_int32_addmul_rate_test", &ret); cl_kernel int64_add_rate_kernel = clCreateKernel(program, "int64_add_rate_test", &ret); cl_kernel int64_mul_rate_kernel = clCreateKernel(program, "int64_mul_rate_test", &ret); cl_kernel int16_add_rate_kernel = clCreateKernel(program, "int16_add_rate_test", &ret); cl_kernel int16_mul_rate_kernel = clCreateKernel(program, "int16_mul_rate_test", &ret); cl_kernel int8_add_rate_kernel = clCreateKernel(program, "int8_add_rate_test", &ret); cl_kernel int8_mul_rate_kernel = clCreateKernel(program, "int8_mul_rate_test", &ret); cl_kernel fp32_fma_latency_kernel = clCreateKernel(program, "fp32_fma_latency_test", &ret); cl_kernel fp32_add_latency_kernel = clCreateKernel(program, "fp32_add_latency_test", &ret); cl_kernel int32_add_latency_kernel = clCreateKernel(program, "int32_add_latency_test", &ret); cl_kernel int32_mul_latency_kernel = clCreateKernel(program, "int32_mul_latency_test", &ret); cl_kernel int32_add_scalar_latency_kernel = clCreateKernel(program, "int32_add_scalar_latency_test", &ret); cl_kernel int32_mul_scalar_latency_kernel = clCreateKernel(program, "int32_mul_scalar_latency_test", &ret); cl_kernel fp32_add_scalar_latency_kernel = clCreateKernel(program, "fp32_add_scalar_latency_test", &ret); cl_kernel fp32_fma_scalar_latency_kernel = 
clCreateKernel(program, "fp32_fma_scalar_latency_test", &ret); cl_kernel fp32_mul_scalar_latency_kernel = clCreateKernel(program, "fp32_mul_scalar_latency_test", &ret); cl_kernel fp32_mul_latency_kernel = clCreateKernel(program, "fp32_mul_latency_test", &ret); float* A = (float*)malloc(sizeof(float) * float4_element_count * 4); float* result = (float*)malloc(sizeof(float) * 4 * thread_count); if (!A || !result) { fprintf(stderr, "Failed to allocate memory instruction rate test\n"); } cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, float4_element_count * sizeof(float), NULL, &ret); cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * 4 * thread_count, NULL, &ret); // Integer test first uint32_t *int32_A = (uint32_t*)A; for (int i = 0; i < float4_element_count * 4; i++) { int32_A[i] = i + 1; } // 4x int4 * 8 per iteration, and count the loop increment too opsPerIteration = 4.0f * 8.0f; float int32_add_rate = run_rate_test(context, command_queue, int32_add_rate_kernel, thread_count, local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "INT32 G Adds/sec: %f\n", int32_add_rate); printf("===== INT32 add latency =====\n"); float int32_add_latency = run_latency_test(context, command_queue, int32_add_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f); fprintf(stderr, "INT32 add latency: %f ns\n", int32_add_latency); printf("===== INT32 add latency (scalar) =====\n"); int32_add_latency = run_latency_test(context, command_queue, int32_add_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f); fprintf(stderr, "INT32 add latency (scalar): %f ns\n", int32_add_latency); printf("===== INT32 mul latency =====\n"); float int32_mul_latency = run_latency_test(context, command_queue, int32_mul_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f); 
fprintf(stderr, "INT32 mul latency: %f ns\n", int32_mul_latency); printf("===== INT32 mul latency (scalar) =====\n"); int32_mul_latency = run_latency_test(context, command_queue, int32_mul_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f); fprintf(stderr, "INT32 mul latency (scalar): %f ns\n", int32_mul_latency); opsPerIteration = 4.0f * 8.0f; float int32_mul_rate = run_rate_test(context, command_queue, int32_mul_rate_kernel, thread_count, local_size, (chase_iterations / 2), float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "INT32 G Multiplies/sec: %f\n", int32_mul_rate); // FP32 add and fma test cl_float* fp32_A = (cl_float*)A; for (int i = 0; i < float4_element_count * 4; i++) { fp32_A[i] = 0.5f * i; } opsPerIteration = 4.0f * 8.0f; float fp32_add_rate = run_rate_test(context, command_queue, fp32_add_rate_kernel, thread_count, local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "FP32 G Adds/sec: %f\n", fp32_add_rate); printf("===== FP32 add latency =====\n"); float fp32_add_latency = run_latency_test(context, command_queue, fp32_add_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f); fprintf(stderr, "FP32 add latency: %f ns\n", fp32_add_latency); printf("===== FP32 add latency (scalar) =====\n"); fp32_add_latency = run_latency_test(context, command_queue, fp32_add_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f); fprintf(stderr, "FP32 add latency (scalar): %f ns\n", fp32_add_latency); printf("===== FP32 fma latency =====\n"); float fp32_fma_latency = run_latency_test(context, command_queue, fp32_fma_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f); fprintf(stderr, "FP32 FMA latency: %f ns\n", fp32_fma_latency); printf("===== FP32 fma latency (scalar) =====\n"); 
fp32_fma_latency = run_latency_test(context, command_queue, fp32_fma_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f); fprintf(stderr, "FP32 FMA latency (scalar): %f ns\n", fp32_fma_latency); printf("===== FP32 mul latency =====\n"); fp32_fma_latency = run_latency_test(context, command_queue, fp32_mul_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f); fprintf(stderr, "FP32 mul latency: %f ns\n", fp32_fma_latency); fp32_fma_latency = run_latency_test(context, command_queue, fp32_mul_scalar_latency_kernel, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, 8.0f); fprintf(stderr, "FP32 mul latency (scalar): %f ns\n", fp32_fma_latency); float fp32_fma_rate = run_rate_test(context, command_queue, fp32_fma_rate_kernel, thread_count, local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "FP32 G FMA/sec: %f : %f GFLOPs\n", fp32_fma_rate, fp32_fma_rate * 2); float builtin_fp32_fma_rate = run_rate_test(context, command_queue, fp32_builtin_fma_rate_kernel, thread_count, local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "FP32 G fma()/sec: %f : %f GFLOPs\n", builtin_fp32_fma_rate, builtin_fp32_fma_rate * 2); fp32_fma_rate = run_rate_test(context, command_queue, fp32_mad_rate_kernel, thread_count, local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "FP32 G mad()/sec: %f : %f GFLOPs\n", fp32_fma_rate, fp32_fma_rate * 2); float fp32_rcp_rate = run_rate_test(context, command_queue, fp32_rcp_rate_kernel, thread_count, local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "FP32 G native_recip/sec: %f\n", fp32_rcp_rate); float fp32_rsqrt_rate = run_rate_test(context, command_queue, 
fp32_rsqrt_rate_kernel, thread_count, local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "FP32 G native_rsqrt/sec: %f\n", fp32_rsqrt_rate); // Mixed INT32 and FP32 - 4 FP32, 4 INT32, and the loop increment // takes FP inputs and converts some to int opsPerIteration = 4.0f * 8.0f + 1.0f; float mix_fp32_int32_rate = run_rate_test(context, command_queue, mix_fp32_int32_add_rate_kernel, thread_count, local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "Mixed INT32 and FP32 G Adds/sec: %f\n", mix_fp32_int32_rate); // Test the same with integer multiplies mix_fp32_int32_rate = run_rate_test(context, command_queue, mix_fp32_int32_addmul_rate_kernel, thread_count, local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "Mixed INT32 Multiplies and FP32 G Adds/sec: %f\n", mix_fp32_int32_rate); // INT64 add test cl_ulong* int64_A = (cl_ulong*)A; for (int i = 0; i < float4_element_count * 2; i++) { int64_A[i] = i * 2; } opsPerIteration = 2.0f * 8.0f; float int64_add_rate = run_rate_test(context, command_queue, int64_add_rate_kernel, thread_count, local_size, chase_iterations / 2, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "INT64 G Adds/sec: %f\n", int64_add_rate); opsPerIteration = 2.0f * 8.0f; float int64_mul_rate = run_rate_test(context, command_queue, int64_mul_rate_kernel, thread_count, local_size, chase_iterations / 8, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "INT64 G Multiplies/sec: %f\n", int64_mul_rate); // INT16 (short) tests cl_ushort* int16_A = (cl_ushort*)A; for (int i = 0; i < float4_element_count * 8; i++) { int16_A[i] = i; } // short8 opsPerIteration = 8.0f * 8.0f; float int16_add_rate = run_rate_test(context, command_queue, int16_add_rate_kernel, thread_count, 
local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "INT16 G Adds/sec: %f\n", int16_add_rate); float int16_mul_rate = run_rate_test(context, command_queue, int16_mul_rate_kernel, thread_count, local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "INT16 G Multiplies/sec: %f \n", int16_mul_rate); // INT8 (char) tests cl_char* int8_A = (cl_char*)A; for (int i = 0; i < float4_element_count * 8; i++) { int8_A[i] = i; } uint32_t int8_chase_iterations = chase_iterations / 10; opsPerIteration = 16.0f * 8.0f; float int8_add_rate = run_rate_test(context, command_queue, int8_add_rate_kernel, thread_count, local_size, int8_chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "INT8 G Adds/sec: %f\n", int8_add_rate); float int8_mul_rate = run_rate_test(context, command_queue, int8_mul_rate_kernel, thread_count, local_size, int8_chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration); fprintf(stderr, "INT8 G Multiplies/sec: %f\n", int8_mul_rate); short checkExtensionSupport(const char *extension_name); if (checkExtensionSupport("cl_khr_fp64") || forcefp64) { fp64_instruction_rate_test(context, command_queue, thread_count, local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result); } else { fprintf(stderr, "FP64 not supported\n"); } if (checkExtensionSupport("cl_khr_fp16") || forcefp16) { fp16_instruction_rate_test(context, command_queue, thread_count, local_size, chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result); } else { fprintf(stderr, "FP16 not supported\n"); } cleanup: clFlush(command_queue); clFinish(command_queue); clReleaseMemObject(a_mem_obj); clReleaseMemObject(result_obj); free(A); free(result); return gOpsPerSec; } // Runs an instruction rate test. 
The kernel is expected to perform opsPerIteration * chase_iterations operations // Mostly simplifies the uber instruction rate test above. Expects memory to be pre-allocated for example. // Returns GOPS float run_rate_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, int float4_element_count, cl_mem a_mem_obj, cl_mem result_obj, cl_float* A, cl_float* result, float opsPerIteration) { size_t global_item_size = thread_count; size_t local_item_size = local_size; cl_int ret; float totalOps, gOpsPerSec; uint64_t time_diff_ms = 0; memset(result, 0, sizeof(float) * 4 * thread_count); ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, float4_element_count * sizeof(float), A, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * 4 * thread_count, result, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj); clFinish(command_queue); //fprintf(stderr, "Submitting fp32 add kernel to command queue\n"); // start with a low iteration count and try to make it work for all GPUs without needing manual iteration adjustment while (time_diff_ms < TARGET_TIME_MS / 2) { start_timing(); ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret); gOpsPerSec = 0; return 0; } ret = clFinish(command_queue); if (ret != CL_SUCCESS) { printf("Failed to finish command queue. 
clFinish returned %d\n", ret); gOpsPerSec = 0; return 0; } time_diff_ms = end_timing(); totalOps = (float)chase_iterations * opsPerIteration * (float)thread_count; gOpsPerSec = ((float)totalOps / 1e9) / ((float)time_diff_ms / 1000); //fprintf(stderr, "chase iterations: %d, thread count: %d\n", chase_iterations, thread_count); //fprintf(stderr, "total ops: %f (%.2f G)\ntotal time: %llu ms\n", totalOps, totalOps / 1e9, time_diff_ms); chase_iterations = adjust_iterations(chase_iterations, time_diff_ms); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); } return gOpsPerSec; } // Variation of the test above but input array size is aligned with assumed wave size. // if partitioning pattern, this will test partitioning with active waves in the specified pattern float run_divergence_rate_test(cl_context context, cl_command_queue command_queue, uint32_t thread_count, uint32_t local_size, uint32_t wave, int *partitionPattern) { size_t global_item_size = thread_count; size_t local_item_size = local_size; uint32_t active_threads = thread_count; cl_int ret; float totalOps, gOpsPerSec; uint64_t time_diff_ms = 0; uint32_t chase_iterations = 2500000; cl_program program = build_program(context, "instruction_rate_kernel.cl", NULL); cl_kernel kernel = clCreateKernel(program, partitionPattern == NULL ? 
"fp32_divergence_rate_test" : "fp32_partition_rate_test", &ret); float* result = (float*)malloc(sizeof(float) * thread_count); float* A = (float*)malloc(sizeof(float) * thread_count); memset(result, 0, sizeof(float) * thread_count); if (partitionPattern != NULL) active_threads = 0; if (partitionPattern != NULL) fprintf(stderr, "\n"); for (int i = 0; i < thread_count; i++) { if (partitionPattern == NULL) { // divergence test if ((i / wave) % 2 == 0) A[i] = 0.2f; else A[i] = 0.8f; } else { if (partitionPattern[(i / wave)]) { A[i] = 0.2f; fprintf(stderr, "a "); active_threads++; } else { fprintf(stderr, "_ "); A[i] = 1.2f; } if ((i + 1) % wave == 0) { fprintf(stderr, "\n"); } } } if (partitionPattern != NULL) fprintf(stderr, "\nActive threads: %d\n", active_threads); cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, thread_count * sizeof(float), NULL, &ret); cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, thread_count * sizeof(float), NULL, &ret); ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, thread_count * sizeof(float), A, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, thread_count * sizeof(float), result, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj); clFinish(command_queue); // start with a low iteration count and try to make it work for all GPUs without needing manual iteration adjustment while (time_diff_ms < TARGET_TIME_MS / 2) { start_timing(); ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret); gOpsPerSec = 0; return 0; } ret = clFinish(command_queue); if (ret != CL_SUCCESS) { printf("Failed to finish command queue. 
clFinish returned %d\n", ret); gOpsPerSec = 0; return 0; } time_diff_ms = end_timing(); totalOps = (float)chase_iterations * 8 * (float)active_threads; gOpsPerSec = ((float)totalOps / 1e9) / ((float)time_diff_ms / 1000); //fprintf(stderr, "chase iterations: %d, thread count: %d\n", chase_iterations, thread_count); //fprintf(stderr, "total ops: %f (%.2f G)\ntotal time: %llu ms\n", totalOps, totalOps / 1e9, time_diff_ms); chase_iterations = adjust_iterations(chase_iterations, time_diff_ms); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); } clReleaseMemObject(a_mem_obj); clReleaseMemObject(result_obj); free(A); free(result); clReleaseKernel(kernel); clReleaseProgram(program); return gOpsPerSec; } // often takes time for clocks to settle? #define LATENCY_REPEAT 5 float run_latency_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t chase_iterations, int float4_element_count, cl_mem a_mem_obj, cl_mem result_obj, cl_float* A, cl_float* result, float opsPerIteration) { size_t global_item_size = 1; size_t local_item_size = 1; cl_int ret; float latency; uint64_t time_diff_ms = 0; // hack around latency taking longer chase_iterations = chase_iterations / 50; // testing returning a float4 memset(result, 0, sizeof(float) * 4); ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, float4_element_count * sizeof(float), A, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(float) * 4, result, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj); clFinish(command_queue); //fprintf(stderr, "Submitting fp32 add kernel to command queue\n"); // start with a low iteration count and try to make it work for all GPUs without needing manual iteration adjustment while (time_diff_ms < TARGET_TIME_MS / 2) { start_timing(); ret = 
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret); latency = 0; return 0; } ret = clFinish(command_queue); if (ret != CL_SUCCESS) { printf("Failed to finish command queue. clFinish returned %d\n", ret); latency = 0; return 0; } time_diff_ms = end_timing(); chase_iterations = adjust_iterations(chase_iterations, time_diff_ms); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); } float totalOps = (float)chase_iterations * opsPerIteration * (float)global_item_size; latency = (float)time_diff_ms * 1e6 / totalOps; // fprintf(stderr, "\tinitial run: %f ns latency\n", latency); float minLatency = 0.0f; for (int i = 0; i < LATENCY_REPEAT; i++) { start_timing(); clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); clFinish(command_queue); time_diff_ms = end_timing(); latency = (float)time_diff_ms * 1e6 / totalOps; // fprintf(stderr, "\trun %d: %f ns latency\n", i, latency); if (i == 0 || latency < minLatency) minLatency = latency; } //fprintf(stderr, "chase iterations: %d, thread count: %d\n", chase_iterations, thread_count); //fprintf(stderr, "total ops: %f (%.2f G)\ntotal time: %llu ms\n", totalOps, totalOps / 1e9, time_diff_ms); return minLatency; } // taking out FP64 because some implementations don't support it. 
putting another build program + create kernel section
// in the main instruction rate test function would be too messy
//
// Builds instruction_rate_fp64_kernel.cl and reports FP64 add, FMA and mad() rates
// on stderr. A is refilled with doubles (0.5 * index) before the runs, so its
// previous contents are overwritten.
// Returns the last measured rate (mad()).
float fp64_instruction_rate_test(cl_context context,
    cl_command_queue command_queue,
    uint32_t thread_count,
    uint32_t local_size,
    uint32_t chase_iterations,
    int float4_element_count,
    cl_mem a_mem_obj,
    cl_mem result_obj,
    cl_float *A,
    cl_float*result)
{
    float gOpsPerSec, opsPerIteration;
    cl_int ret;

    // FP64 add test
    uint32_t low_chase_iterations = chase_iterations / 4;
    cl_double* fp64_A = (cl_double*)A;
    for (int i = 0; i < float4_element_count * 2; i++) {
        fp64_A[i] = 0.5f * i;
    }

    memset(result, 0, sizeof(float) * 4 * thread_count);

    cl_program program = build_program(context, "instruction_rate_fp64_kernel.cl", NULL);
    if (saveprogram) write_program(program, "fp64irate");

    cl_kernel fp64_add_rate_kernel = clCreateKernel(program, "fp64_add_rate_test", &ret);
    cl_kernel fp64_fma_rate_kernel = clCreateKernel(program, "fp64_fma_rate_test", &ret);
    cl_kernel fp64_mad_rate_kernel = clCreateKernel(program, "fp64_mad_rate_test", &ret);

    // 8 double2 operations per loop iteration
    opsPerIteration = 2.0f * 8.0f;

    gOpsPerSec = run_rate_test(context, command_queue, fp64_add_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP64 G Adds/sec: %f\n", gOpsPerSec);

    gOpsPerSec = run_rate_test(context, command_queue, fp64_fma_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP64 G FMAs/sec: %f : %f FP64 GFLOPs\n", gOpsPerSec, gOpsPerSec * 2);

    gOpsPerSec = run_rate_test(context, command_queue, fp64_mad_rate_kernel, thread_count, local_size, low_chase_iterations,
        float4_element_count, a_mem_obj, result_obj, A, result, opsPerIteration);
    fprintf(stderr, "FP64 G mad()/sec: %f : %f FP64 GFLOPs\n", gOpsPerSec, gOpsPerSec * 2);

    // These objects are local to this function and were previously leaked
    if (fp64_add_rate_kernel) clReleaseKernel(fp64_add_rate_kernel);
    if (fp64_fma_rate_kernel) clReleaseKernel(fp64_fma_rate_kernel);
    if (fp64_mad_rate_kernel) clReleaseKernel(fp64_mad_rate_kernel);
    if (program) clReleaseProgram(program);

    return gOpsPerSec;
}

// taking out FP16
too because it requires an extension to be supported float fp16_instruction_rate_test(cl_context context, cl_command_queue command_queue, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, int float4_element_count, cl_mem a_mem_obj, cl_mem result_obj, cl_float* A, cl_float* result) { size_t global_item_size = thread_count; size_t local_item_size = local_size; float gOpsPerSec, totalOps; cl_int ret; int64_t time_diff_ms; // FP64 add test uint32_t low_chase_iterations = chase_iterations / 4; cl_half* fp16_A = (cl_float*)A; for (int i = 0; i < float4_element_count * 8; i++) { fp16_A[i] = (cl_half)(0.5f * i); } memset(result, 0, sizeof(float) * 4 * thread_count); cl_program program = build_program(context, "instruction_rate_fp16_kernel.cl", NULL); if (saveprogram) write_program(program, "fp16irate"); cl_kernel fp16_add_rate_kernel = clCreateKernel(program, "fp16_add_rate_test", &ret); cl_kernel fp16_fma_rate_kernel = clCreateKernel(program, "fp16_fma_rate_test", &ret); //cl_kernel fp16_rsqrt_rate_kernel = clCreateKernel(program, "fp16_rsqrt_rate_test", &ret); totalOps = 8.0f * 8.0f; gOpsPerSec = run_rate_test(context, command_queue, fp16_add_rate_kernel, thread_count, local_size, low_chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, totalOps); fprintf(stderr, "FP16 G Adds/sec: %f\n", gOpsPerSec); gOpsPerSec = run_rate_test(context, command_queue, fp16_fma_rate_kernel, thread_count, local_size, low_chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, totalOps); fprintf(stderr, "FP16 G FMAs/sec: %f : %f FP16 GFLOPs\n", gOpsPerSec, gOpsPerSec * 2); /*gOpsPerSec = run_rate_test(context, command_queue, fp16_rsqrt_rate_kernel, thread_count, local_size, low_chase_iterations, float4_element_count, a_mem_obj, result_obj, A, result, totalOps); fprintf(stderr, "FP16 G native_rsqrt/sec: %f\n", gOpsPerSec);*/ return gOpsPerSec; } ================================================ FILE: 
GpuMemLatency/instruction_rate_fp16_kernel.cl
================================================
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Only used to mask the local id when picking starting values (see masked_tid below)
#define rate_local_mem_test_size 256

// FP16 add throughput kernel.
//   A     - input values; each work item reads A[masked_tid .. masked_tid + 3] and A[0]
//   count - number of loop iterations
//   ret   - one half8 per work item, indexed by global id
// Each iteration performs eight half8 adds on eight separate accumulators.
__kernel void fp16_add_rate_test(__global half8 *A, int count, __global half8 *ret) {
    int tid = get_local_id(0);
    int max_offset = get_local_size(0); // not used in this kernel
    __global half8 *local_a = A;
    int masked_tid = tid & (rate_local_mem_test_size - 1);
    half8 v0 = local_a[masked_tid];
    half8 v1 = local_a[masked_tid + 1];
    half8 v2 = local_a[masked_tid + 2];
    half8 v3 = local_a[masked_tid + 3];
    half8 v4 = v0 + v1;
    half8 v5 = v0 + v2;
    half8 v6 = v0 + v3;
    half8 v7 = v1 + v2;
    half8 acc = local_a[0];

    for (int i = 0; i < count; i++) {
        v0 += acc;
        v1 += acc;
        v2 += acc;
        v3 += acc;
        v4 += acc;
        v5 += acc;
        v6 += acc;
        v7 += acc;
    }

    // Every accumulator feeds the stored result
    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}

// FP16 multiply-add throughput kernel. Same setup as fp16_add_rate_test, but each
// accumulator is updated as v += acc * v. The host reports this rate as FMAs and
// doubles it for GFLOPs.
__kernel void fp16_fma_rate_test(__global half8 *A, int count, __global half8 *ret) {
    int tid = get_local_id(0);
    int max_offset = get_local_size(0); // not used in this kernel
    __global half8 *local_a = A;
    int masked_tid = tid & (rate_local_mem_test_size - 1);
    half8 v0 = local_a[masked_tid];
    half8 v1 = local_a[masked_tid + 1];
    half8 v2 = local_a[masked_tid + 2];
    half8 v3 = local_a[masked_tid + 3];
    half8 v4 = v0 + v1;
    half8 v5 = v0 + v2;
    half8 v6 = v0 + v3;
    half8 v7 = v1 + v2;
    half8 acc = local_a[0];

    for (int i = 0; i < count; i++) {
        v0 += acc * v0;
        v1 += acc * v1;
        v2 += acc * v2;
        v3 += acc * v3;
        v4 += acc * v4;
        v5 += acc * v5;
        v6 += acc * v6;
        v7 += acc * v7;
    }

    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}

/*__kernel void fp16_rsqrt_rate_test(__global half8 *A, int count, __global half8 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global half8 *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); half8 v0 = local_a[masked_tid]; half8 v1 = local_a[masked_tid + 1]; half8 v2 = local_a[masked_tid + 2]; half8 v3 = local_a[masked_tid + 3]; half8 v4 = v0 + v1; half8 v5 = v0 + v2; half8 v6 = v0 + v3; half8 v7 =
v1 + v2; for (int i = 0; i < count; i++) { v0 = native_rsqrt(v0); v1 = native_rsqrt(v1); v2 = native_rsqrt(v2); v3 = native_rsqrt(v3); v4 = native_rsqrt(v4); v5 = native_rsqrt(v5); v6 = native_rsqrt(v6); v7 = native_rsqrt(v7); } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } */

================================================
FILE: GpuMemLatency/instruction_rate_fp64_kernel.cl
================================================
// Only used to mask the local id when picking starting values (see masked_tid below)
#define rate_local_mem_test_size 256

// FP64 add throughput kernel.
//   A     - input values; each work item reads A[masked_tid .. masked_tid + 3] and A[0]
//   count - number of loop iterations
//   ret   - one double2 per work item, indexed by global id
// Each iteration performs eight double2 adds on eight separate accumulators.
__kernel void fp64_add_rate_test(__global double2 *A, int count, __global double2 *ret) {
    int tid = get_local_id(0);
    int max_offset = get_local_size(0); // not used in this kernel
    __global double2 *local_a = A;
    int masked_tid = tid & (rate_local_mem_test_size - 1);
    double2 v0 = local_a[masked_tid];
    double2 v1 = local_a[masked_tid + 1];
    double2 v2 = local_a[masked_tid + 2];
    double2 v3 = local_a[masked_tid + 3];
    double2 v4 = v0 + v1;
    double2 v5 = v0 + v2;
    double2 v6 = v0 + v3;
    double2 v7 = v1 + v2;
    double2 acc = local_a[0];

    for (int i = 0; i < count; i++) {
        v0 += acc;
        v1 += acc;
        v2 += acc;
        v3 += acc;
        v4 += acc;
        v5 += acc;
        v6 += acc;
        v7 += acc;
    }

    // Every accumulator feeds the stored result
    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}

// FP64 multiply-add throughput kernel. Same setup as fp64_add_rate_test, but each
// accumulator is updated as v += acc * v. The host reports this rate as FMAs and
// doubles it for GFLOPs.
__kernel void fp64_fma_rate_test(__global double2 *A, int count, __global double2 *ret) {
    int tid = get_local_id(0);
    int max_offset = get_local_size(0); // not used in this kernel
    __global double2 *local_a = A;
    int masked_tid = tid & (rate_local_mem_test_size - 1);
    double2 v0 = local_a[masked_tid];
    double2 v1 = local_a[masked_tid + 1];
    double2 v2 = local_a[masked_tid + 2];
    double2 v3 = local_a[masked_tid + 3];
    double2 v4 = v0 + v1;
    double2 v5 = v0 + v2;
    double2 v6 = v0 + v3;
    double2 v7 = v1 + v2;
    double2 acc = local_a[0];

    for (int i = 0; i < count; i++) {
        v0 += acc * v0;
        v1 += acc * v1;
        v2 += acc * v2;
        v3 += acc * v3;
        v4 += acc * v4;
        v5 += acc * v5;
        v6 += acc * v6;
        v7 += acc * v7;
    }

    ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7;
}

__kernel void fp64_mad_rate_test(__global double2 *A, int count, __global double2 *ret) { int
tid = get_local_id(0); int max_offset = get_local_size(0); __global double2 *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); double2 v0 = local_a[masked_tid]; double2 v1 = local_a[masked_tid + 1]; double2 v2 = local_a[masked_tid + 2]; double2 v3 = local_a[masked_tid + 3]; double2 v4 = v0 + v1; double2 v5 = v0 + v2; double2 v6 = v0 + v3; double2 v7 = v1 + v2; double2 acc = local_a[0]; for (int i = 0; i < count; i++) { v0 = mad(acc, v0, v0); v1 = mad(acc, v1, v1); v2 = mad(acc, v2, v2); v3 = mad(acc, v3, v3); v4 = mad(acc, v4, v3); v5 = mad(acc, v5, v5); v6 = mad(acc, v6, v6); v7 = mad(acc, v7, v7); } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } ================================================ FILE: GpuMemLatency/instruction_rate_kernel.cl ================================================ #define rate_local_mem_test_size 512 // A must be at least (local size * 4) uint32 elements in size, but must not exceed local mem size // jk it doesn't use local mem now __kernel void int32_add_rate_test(__global uint4 *A, int count, __global uint4 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __local uint4 local_a[rate_local_mem_test_size]; for (int i = tid;i < rate_local_mem_test_size; i += max_offset) local_a[i] = A[i]; barrier(CLK_LOCAL_MEM_FENCE); // __global uint4 *local_a = A; int masked_tid = min(tid, rate_local_mem_test_size - 8); uint4 v0 = local_a[masked_tid]; uint4 v1 = local_a[masked_tid + 1]; uint4 v2 = local_a[masked_tid + 2]; uint4 v3 = local_a[masked_tid + 3]; uint4 v4 = local_a[masked_tid + 4]; uint4 v5 = local_a[masked_tid + 5]; uint4 v6 = local_a[masked_tid + 6]; uint4 v7 = local_a[masked_tid + 7]; for (int i = 0; i < count; i++) { uint4 acc = local_a[i & (rate_local_mem_test_size - 1)]; v0 += acc; v1 += acc; v2 += acc; v3 += acc; v4 += acc; v5 += acc; v6 += acc; v7 += acc; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void int32_mul_rate_test(__global uint4 *A, int 
count, __global uint4 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global uint4 *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); uint4 v0 = local_a[masked_tid]; uint4 v1 = local_a[masked_tid + 1]; uint4 v2 = local_a[masked_tid + 2]; uint4 v3 = local_a[masked_tid + 3]; uint4 v4 = v0 + v1; uint4 v5 = v0 + v2; uint4 v6 = v0 + v3; uint4 v7 = v1 + v2; uint4 acc = local_a[0]; for (int i = 0; i < count; i++) { //uint4 acc = local_a[i & (rate_local_mem_test_size) - 1]; v0 *= acc; v1 *= acc; v2 *= acc; v3 *= acc; v4 *= acc; v5 *= acc; v6 *= acc; v7 *= acc; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void fp32_add_rate_test(__global float4 *A, int count, __global float4 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global float4 *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float4 v0 = local_a[masked_tid]; float4 v1 = local_a[masked_tid + 1]; float4 v2 = local_a[masked_tid + 2]; float4 v3 = local_a[masked_tid + 3]; float4 v4 = v0 + v1; float4 v5 = v0 + v2; float4 v6 = v0 + v3; float4 v7 = v1 + v2; float4 acc = local_a[0]; for (int i = 0; i < count; i++) { //float4 acc = local_a[i & (rate_local_mem_test_size) - 1]; v0 += acc; v1 += acc; v2 += acc; v3 += acc; v4 += acc; v5 += acc; v6 += acc; v7 += acc; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void fp32_rcp_rate_test(__global float4 *A, int count, __global float4 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global float4 *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float4 v0 = local_a[masked_tid]; float4 v1 = local_a[masked_tid + 1]; float4 v2 = local_a[masked_tid + 2]; float4 v3 = local_a[masked_tid + 3]; float4 v4 = v0 + v1; float4 v5 = v0 + v2; float4 v6 = v0 + v3; float4 v7 = v1 + v2; float4 acc = local_a[0]; for (int i = 0; i < count; i++) { v0 = native_recip(v0); v1 = native_recip(v1); v2 = native_recip(v2); 
v3 = native_recip(v3); v4 = native_recip(v4); v5 = native_recip(v5); v6 = native_recip(v6); v7 = native_recip(v7); } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void fp32_rsqrt_rate_test(__global float4 *A, int count, __global float4 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global float4 *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float4 v0 = local_a[masked_tid]; float4 v1 = local_a[masked_tid + 1]; float4 v2 = local_a[masked_tid + 2]; float4 v3 = local_a[masked_tid + 3]; float4 v4 = v0 + v1; float4 v5 = v0 + v2; float4 v6 = v0 + v3; float4 v7 = v1 + v2; float4 acc = local_a[0]; for (int i = 0; i < count; i++) { v0 = native_rsqrt(v0); v1 = native_rsqrt(v1); v2 = native_rsqrt(v2); v3 = native_rsqrt(v3); v4 = native_rsqrt(v4); v5 = native_rsqrt(v5); v6 = native_rsqrt(v6); v7 = native_rsqrt(v7); } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void int64_add_rate_test(__global ulong2 *A, int count, __global ulong2 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global ulong2 *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); ulong2 v0 = local_a[masked_tid]; ulong2 v1 = local_a[masked_tid + 1]; ulong2 v2 = local_a[masked_tid + 2]; ulong2 v3 = local_a[masked_tid + 3]; ulong2 v4 = v0 + v1; ulong2 v5 = v0 + v2; ulong2 v6 = v0 + v3; ulong2 v7 = v1 + v2; ulong2 acc = local_a[0]; for (int i = 0; i < count; i++) { //uint4 acc = local_a[i & (rate_local_mem_test_size) - 1]; v0 += acc; v1 += acc; v2 += acc; v3 += acc; v4 += acc; v5 += acc; v6 += acc; v7 += acc; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void int64_mul_rate_test(__global ulong2 *A, int count, __global ulong2 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global ulong2 *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); ulong2 v0 = local_a[masked_tid]; ulong2 v1 = local_a[masked_tid + 
1]; ulong2 v2 = local_a[masked_tid + 2]; ulong2 v3 = local_a[masked_tid + 3]; ulong2 v4 = v0 + v1; ulong2 v5 = v0 + v2; ulong2 v6 = v0 + v3; ulong2 v7 = v1 + v2; ulong2 acc = local_a[0]; for (int i = 0; i < count; i++) { //uint4 acc = local_a[i & (rate_local_mem_test_size) - 1]; v0 *= acc; v1 *= acc; v2 *= acc; v3 *= acc; v4 *= acc; v5 *= acc; v6 *= acc; v7 *= acc; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void mix_fp32_int32_add_rate_test(__global float4 *A, int count, __global float4 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __local int4 local_a[rate_local_mem_test_size]; for (int i = tid;i < rate_local_mem_test_size; i += max_offset) local_a[i] = convert_int4_sat(A[i]); barrier(CLK_LOCAL_MEM_FENCE); int masked_tid = tid & (rate_local_mem_test_size - 1); float4 v0 = A[masked_tid]; float4 v1 = A[masked_tid + 1]; float4 v2 = A[masked_tid + 2]; float4 v3 = A[masked_tid + 3]; int4 v4 = convert_int4_sat(v0 + v1); int4 v5 = convert_int4_sat(v0 + v2); int4 v6 = convert_int4_sat(v0 + v3); int4 v7 = convert_int4_sat(v1 + v2); float4 fp_acc = A[0]; for (int i = 0; i < count; i++) { int4 int_acc = local_a[i & (rate_local_mem_test_size - 1)]; v0 += fp_acc; v1 += fp_acc; v2 += fp_acc; v3 += fp_acc; v4 += int_acc; v5 += int_acc; v6 += int_acc; v7 += int_acc; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + convert_float4(v4 + v5 + v6 + v7); } __kernel void mix_fp32_int32_addmul_rate_test(__global float4 *A, int count, __global float4 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global float4 *fp32_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float4 v0 = fp32_a[masked_tid]; float4 v1 = fp32_a[masked_tid + 1]; float4 v2 = fp32_a[masked_tid + 2]; float4 v3 = fp32_a[masked_tid + 3]; int4 v4 = convert_int4_sat(v0 + v1); int4 v5 = convert_int4_sat(v0 + v2); int4 v6 = convert_int4_sat(v0 + v3); int4 v7 = convert_int4_sat(v1 + v2); float4 fp_acc = fp32_a[0]; int4 int_acc = 
convert_int4_sat(fp32_a[0]); for (int i = 0; i < count; i++) { v0 += fp_acc; v1 += fp_acc; v2 += fp_acc; v3 += fp_acc; v4 *= int_acc; v5 *= int_acc; v6 *= int_acc; v7 *= int_acc; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + convert_float4(v4 + v5 + v6 + v7); } __kernel void fp32_fma_rate_test(__global float4 *A, int count, __global float4 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global float4 *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float4 v0 = local_a[masked_tid]; float4 v1 = local_a[masked_tid + 1]; float4 v2 = local_a[masked_tid + 2]; float4 v3 = local_a[masked_tid + 3]; float4 v4 = local_a[masked_tid + 4]; float4 v5 = local_a[masked_tid + 5]; float4 v6 = local_a[masked_tid + 6]; float4 v7 = local_a[masked_tid + 7]; float4 acc = local_a[0]; for (int i = 0; i < count; i++) { v0 += acc * v0; v1 += acc * v1; v2 += acc * v2; v3 += acc * v3; v4 += acc * v4; v5 += acc * v5; v6 += acc * v6; v7 += acc * v7; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void fp32_builtin_fma_rate_test(__global float4 *A, int count, __global float4 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global float4 *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float4 v0 = local_a[masked_tid]; float4 v1 = local_a[masked_tid + 1]; float4 v2 = local_a[masked_tid + 2]; float4 v3 = local_a[masked_tid + 3]; float4 v4 = local_a[masked_tid + 4]; float4 v5 = local_a[masked_tid + 5]; float4 v6 = local_a[masked_tid + 6]; float4 v7 = local_a[masked_tid + 7]; float4 acc = local_a[0]; for (int i = 0; i < count; i++) { v0 = fma(acc, v0, v0); v1 = fma(acc, v1, v1); v2 = fma(acc, v2, v2); v3 = fma(acc, v3, v3); v4 = fma(acc, v4, v4); v5 = fma(acc, v5, v5); v6 = fma(acc, v6, v6); v7 = fma(acc, v7, v7); } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void fp32_mad_rate_test(__global float4 *A, int count, __global float4 *ret) { int tid = 
get_local_id(0); int max_offset = get_local_size(0); __global float4 *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float4 v0 = local_a[masked_tid]; float4 v1 = local_a[masked_tid + 1]; float4 v2 = local_a[masked_tid + 2]; float4 v3 = local_a[masked_tid + 3]; float4 v4 = v0 + v1; float4 v5 = v0 + v2; float4 v6 = v0 + v3; float4 v7 = v1 + v2; float4 acc = local_a[0]; for (int i = 0; i < count; i++) { //float4 acc = local_a[i & (rate_local_mem_test_size) - 1]; v0 = mad(acc, v0, v0); v1 = mad(acc, v1, v1); v2 = mad(acc, v2, v2); v3 = mad(acc, v3, v3); v4 = mad(acc, v4, v4); v5 = mad(acc, v5, v5); v6 = mad(acc, v6, v6); v7 = mad(acc, v7, v7); } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void int16_add_rate_test(__global short8 *A, int count, __global short8 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); //__global short8 *local_a = A; __local short8 local_a[rate_local_mem_test_size]; for (int i = tid;i < rate_local_mem_test_size; i += max_offset) local_a[i] = A[i]; barrier(CLK_LOCAL_MEM_FENCE); int masked_tid = min(tid, rate_local_mem_test_size - 8); short8 v0 = local_a[masked_tid]; short8 v1 = local_a[masked_tid + 1]; short8 v2 = local_a[masked_tid + 2]; short8 v3 = local_a[masked_tid + 3]; short8 v4 = local_a[masked_tid + 4]; short8 v5 = local_a[masked_tid + 5]; short8 v6 = local_a[masked_tid + 6]; short8 v7 = local_a[masked_tid + 7]; for (int i = 0; i < count; i++) { short8 acc = local_a[i & (rate_local_mem_test_size - 1)]; v0 += acc; v1 += acc; v2 += acc; v3 += acc; v4 += acc; v5 += acc; v6 += acc; v7 += acc; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void int16_mul_rate_test(__global short8 *A, int count, __global short8 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); //__global short8 *local_a = A; __local short8 local_a[rate_local_mem_test_size]; for (int i = tid;i < rate_local_mem_test_size; i += max_offset) local_a[i] = A[i]; 
barrier(CLK_LOCAL_MEM_FENCE); int masked_tid = min(tid, rate_local_mem_test_size - 8); short8 v0 = local_a[masked_tid]; short8 v1 = local_a[masked_tid + 1]; short8 v2 = local_a[masked_tid + 2]; short8 v3 = local_a[masked_tid + 3]; short8 v4 = local_a[masked_tid + 4]; short8 v5 = local_a[masked_tid + 5]; short8 v6 = local_a[masked_tid + 6]; short8 v7 = local_a[masked_tid + 7]; for (int i = 0; i < count; i++) { short8 acc = local_a[i & (rate_local_mem_test_size - 1)]; v0 *= acc; v1 *= acc; v2 *= acc; v3 *= acc; v4 *= acc; v5 *= acc; v6 *= acc; v7 *= acc; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void int8_add_rate_test(__global char16 *A, int count, __global char16 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global char16 *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); char16 v0 = local_a[masked_tid]; char16 v1 = local_a[masked_tid + 1]; char16 v2 = local_a[masked_tid + 2]; char16 v3 = local_a[masked_tid + 3]; char16 v4 = v0 + v1; char16 v5 = v0 + v2; char16 v6 = v0 + v3; char16 v7 = v1 + v2; char16 acc = local_a[0]; for (int i = 0; i < count; i++) { v0 += acc; v1 += acc; v2 += acc; v3 += acc; v4 += acc; v5 += acc; v6 += acc; v7 += acc; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void int8_mul_rate_test(__global char16 *A, int count, __global char16 *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global char16 *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); char16 v0 = local_a[masked_tid]; char16 v1 = local_a[masked_tid + 1]; char16 v2 = local_a[masked_tid + 2]; char16 v3 = local_a[masked_tid + 3]; char16 v4 = v0 + v1; char16 v5 = v0 + v2; char16 v6 = v0 + v3; char16 v7 = v1 + v2; char16 acc = local_a[0]; for (int i = 0; i < count; i++) { v0 *= acc; v1 *= acc; v2 *= acc; v3 *= acc; v4 *= acc; v5 *= acc; v6 *= acc; v7 *= acc; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void 
fp32_fma_latency_test(__global float *A, int count, __global float *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global float *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float v0 = local_a[masked_tid]; float v1 = local_a[masked_tid + 1]; float v2 = local_a[masked_tid + 2]; float v3 = local_a[masked_tid + 3]; float v4 = v0 + v1; float v5 = v0 + v2; float v6 = v0 + v3; float v7 = v1 + v2; float acc = local_a[0]; for (int i = 0; i < count; i += 4) { v0 = v7 + acc * v0; v1 = v0 + acc * v1; v2 = v1 + acc * v2; v3 = v2 + acc * v3; v4 = v3 + acc * v4; v5 = v4 + acc * v5; v6 = v5 + acc * v6; v7 = v6 + acc * v7; v0 = v7 + acc * v0; v1 = v0 + acc * v1; v2 = v1 + acc * v2; v3 = v2 + acc * v3; v4 = v3 + acc * v4; v5 = v4 + acc * v5; v6 = v5 + acc * v6; v7 = v6 + acc * v7; v0 = v7 + acc * v0; v1 = v0 + acc * v1; v2 = v1 + acc * v2; v3 = v2 + acc * v3; v4 = v3 + acc * v4; v5 = v4 + acc * v5; v6 = v5 + acc * v6; v7 = v6 + acc * v7; v0 = v7 + acc * v0; v1 = v0 + acc * v1; v2 = v1 + acc * v2; v3 = v2 + acc * v3; v4 = v3 + acc * v4; v5 = v4 + acc * v5; v6 = v5 + acc * v6; v7 = v6 + acc * v7; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void fp32_add_latency_test(__global float *A, int count, __global float *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global float *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float v0 = local_a[masked_tid]; float v1 = local_a[masked_tid + 1]; float v2 = local_a[masked_tid + 2]; float v3 = local_a[masked_tid + 3]; float v4 = v0 + v1; float v5 = v0 + v2; float v6 = v0 + v3; float v7 = v1 + v2; float acc = local_a[0]; for (int i = 0; i < count; i += 4) { v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = 
v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void int32_add_latency_test(__global uint *A, int count, __global uint *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); int masked_tid = tid & (rate_local_mem_test_size - 1); uint v0 = A[masked_tid]; uint v1 = A[masked_tid + 1]; uint v2 = A[masked_tid + 2]; uint v3 = A[masked_tid + 3]; uint v4 = v0 + v1; uint v5 = v0 + v2; uint v6 = v0 + v3; uint v7 = v1 + v2; for (int i = 0; i < count; i += 4) { v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void int32_mul_latency_test(__global uint *A, int count, __global uint *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global uint *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); uint v0 = local_a[masked_tid]; uint v1 = local_a[masked_tid + 1]; uint v2 = local_a[masked_tid + 2]; uint v3 = local_a[masked_tid + 3]; uint v4 = v0 + v1; uint v5 = v0 + v2; uint v6 = v0 + v3; uint v7 = v1 + v2; uint acc = local_a[0]; for (int i = 0; i < count; i += 4) { v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * 
v5; v6 = v5 * v6; v7 = v6 * v7; v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void fp32_divergence_rate_test(__global float *A, int count, __global float *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global float *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float v0 = local_a[masked_tid]; float v1 = local_a[masked_tid + 1]; float v2 = local_a[masked_tid + 2]; float v3 = local_a[masked_tid + 3]; float v4 = v0 + v1; float v5 = v0 + v2; float v6 = v0 + v3; float v7 = v1 + v2; float acc = A[0]; float op = A[get_global_id(0)]; if (op < 1.0) { for (int i = 0; i < count; i++) { if (op < 0.5) { v0 += acc; v1 += acc; v2 += acc; v3 += acc; v4 += acc; v5 += acc; v6 += acc; v7 += acc; } else { v0 *= acc; v1 *= acc; v2 *= acc; v3 *= acc; v4 *= acc; v5 *= acc; v6 *= acc; v7 *= acc; } } } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void fp32_partition_rate_test(__global float *A, int count, __global float *ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global float *local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float v0 = local_a[masked_tid]; float v1 = local_a[masked_tid + 1]; float v2 = local_a[masked_tid + 2]; float v3 = local_a[masked_tid + 3]; float v4 = v0 + v1; float v5 = v0 + v2; float v6 = v0 + v3; float v7 = v1 + v2; float acc = A[0]; float op = A[get_global_id(0)]; if (op < 1.0) { for (int i = 0; i < count; i++) { v0 += acc; v1 += acc; v2 += acc; v3 += acc; v4 += acc; v5 += acc; v6 += acc; v7 += acc; } } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } /// Scalar latency __kernel void int32_add_scalar_latency_test(__global uint* A, int count, __global uint* ret) { int tid = 0; int max_offset = get_local_size(0); int masked_tid = tid & (rate_local_mem_test_size - 1); uint v0 = 
A[masked_tid]; uint v1 = A[masked_tid + 1]; uint v2 = A[masked_tid + 2]; uint v3 = A[masked_tid + 3]; uint v4 = v0 + v1; uint v5 = v0 + v2; uint v6 = v0 + v3; uint v7 = v1 + v2; for (int i = 0; i < count; i += 4) { v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void int32_mul_scalar_latency_test(__global uint* A, int count, __global uint* ret) { int tid = 0; int max_offset = get_local_size(0); __global uint* local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); uint v0 = local_a[masked_tid]; uint v1 = local_a[masked_tid + 1]; uint v2 = local_a[masked_tid + 2]; uint v3 = local_a[masked_tid + 3]; uint v4 = v0 + v1; uint v5 = v0 + v2; uint v6 = v0 + v3; uint v7 = v1 + v2; uint acc = local_a[0]; for (int i = 0; i < count; i += 4) { v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void fp32_add_scalar_latency_test(__global float* A, int count, __global float* ret) { int tid = 0; int max_offset = get_local_size(0); __global float* local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float v0 = 
local_a[masked_tid]; float v1 = local_a[masked_tid + 1]; float v2 = local_a[masked_tid + 2]; float v3 = local_a[masked_tid + 3]; float v4 = v0 + v1; float v5 = v0 + v2; float v6 = v0 + v3; float v7 = v1 + v2; float acc = local_a[0]; for (int i = 0; i < count; i += 8) { v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; v0 = v7 + v0; v1 = v0 + v1; v2 = v1 + v2; v3 = v2 + v3; v4 = v3 + v4; v5 = v4 + v5; v6 = v5 + v6; v7 = v6 + v7; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void fp32_fma_scalar_latency_test(__global float* A, int count, __global float* ret) { int tid = 0; int max_offset = get_local_size(0); __global float* local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float v0 = local_a[masked_tid]; float v1 = local_a[masked_tid + 1]; float v2 = local_a[masked_tid + 2]; float v3 = local_a[masked_tid + 3]; float v4 = v0 + v1; float v5 = v0 + v2; float v6 = v0 + v3; float v7 = v1 + v2; float acc = local_a[0]; for (int i = 0; i < count; i += 4) { v0 = v7 + acc * v0; v1 = v0 + acc * v1; v2 = v1 + acc * v2; v3 = v2 + acc * v3; v4 = v3 + acc * v4; v5 = v4 + acc * v5; v6 = v5 + acc * v6; v7 = v6 + acc * v7; v0 = v7 + acc * v0; v1 = v0 + acc * v1; v2 = v1 + acc * v2; v3 = v2 + acc * v3; v4 = v3 + acc * 
v4; v5 = v4 + acc * v5; v6 = v5 + acc * v6; v7 = v6 + acc * v7; v0 = v7 + acc * v0; v1 = v0 + acc * v1; v2 = v1 + acc * v2; v3 = v2 + acc * v3; v4 = v3 + acc * v4; v5 = v4 + acc * v5; v6 = v5 + acc * v6; v7 = v6 + acc * v7; v0 = v7 + acc * v0; v1 = v0 + acc * v1; v2 = v1 + acc * v2; v3 = v2 + acc * v3; v4 = v3 + acc * v4; v5 = v4 + acc * v5; v6 = v5 + acc * v6; v7 = v6 + acc * v7; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void fp32_mul_scalar_latency_test(__global float* A, int count, __global float* ret) { int tid = 0; int max_offset = get_local_size(0); __global float* local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float v0 = local_a[masked_tid]; float v1 = local_a[masked_tid + 1]; float v2 = local_a[masked_tid + 2]; float v3 = local_a[masked_tid + 3]; float v4 = v0 + v1; float v5 = v0 + v2; float v6 = v0 + v3; float v7 = v1 + v2; float acc = local_a[0]; for (int i = 0; i < count; i += 4) { v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } __kernel void fp32_mul_latency_test(__global float* A, int count, __global float* ret) { int tid = get_local_id(0); int max_offset = get_local_size(0); __global float* local_a = A; int masked_tid = tid & (rate_local_mem_test_size - 1); float v0 = local_a[masked_tid]; float v1 = local_a[masked_tid + 1]; float v2 = local_a[masked_tid + 2]; float v3 = local_a[masked_tid + 3]; float v4 = v0 + v1; float v5 = v0 + v2; float v6 = v0 + v3; float v7 = v1 + v2; float acc = local_a[0]; for (int i = 0; i < count; i += 4) { v0 = 
v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; v0 = v7 * v0; v1 = v0 * v1; v2 = v1 * v2; v3 = v2 * v3; v4 = v3 * v4; v5 = v4 * v5; v6 = v5 * v6; v7 = v6 * v7; } ret[get_global_id(0)] = v0 + v1 + v2 + v3 + v4 + v5 + v6 + v7; } ================================================ FILE: GpuMemLatency/kernel.cl ================================================ // not used, I tried __constant sampler_t direct_sampler = CLK_NORMALIZED_COORDS_FALSE | // coordinates are from 0 to max dimension size CLK_ADDRESS_NONE | // if it goes out of bounds feel free to explode and die CLK_FILTER_NEAREST; __kernel void tex_latency_test(__read_only image1d_buffer_t A, int count, __global int* ret, int list_size) { int localId = get_local_id(0); // uint4 current = read_imageui(A, direct_sampler, 0); // using sampler screws things up int startPos = get_global_size(0) > 1 ? 
ret[get_global_id(0)] : 0;
    uint4 current = read_imageui(A, startPos);
    // printf("start x: %u -> %u\n", startPos, current.x);
    for (int i = 0; i < count; i += 10)
    {
        // printf("current: %u %u %u %u, address: %d\n", current.x, current.y, current.z, current.w, (int)current.x / 4);
        //current = read_imageui(A, direct_sampler, i);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        current = read_imageui(A, current.x);
        //printf("%d: current read: %u %u %u %u\n", i, current.x, current.y, current.z, current.w);
        // local_a[localId] = current;
    }

    ret[get_global_id(0)] = current.x;
}

__constant sampler_t funny_sampler = CLK_NORMALIZED_COORDS_TRUE | // coordinates are from 0 to 1 (float)
                                     CLK_ADDRESS_REPEAT |         // going out of bounds = replicate
                                     CLK_FILTER_NEAREST;

// Texture bandwidth test: each work-item keeps four sampling positions and reads
// the 2D image A through funny_sampler at all four of them on every loop iteration.
//
// A     - image sampled with normalized, repeating coordinates (see funny_sampler).
// count - loop bound; the loop advances i by 4 and issues four reads per iteration.
// ret   - receives a value derived from everything that was read.
__kernel void tex_bw_test(__read_only image2d_t A, int count, __global float* ret) {
    int localId = get_local_id(0);
    float pos = get_global_id(0) * native_recip((float)get_global_size(0));
    float2 increment;
    increment.x = 0.01; // guessing
    increment.y = 0.01;
    float2 current0, current1, current2, current3;
    current0.x = pos;
    current0.y = pos;

    // The per-work-item offsets must be computed in floating point. With integer
    // division (localId / 10000) the quotient is 0 for every local id below the
    // divisor, so all work-items would start at identical coordinates.
    current1.x = 0.1f + (localId / 10000.0f);
    current1.y = 0.1f + (localId / 10000.0f);
    current2.x = 0.01f + (localId / 10000.0f);
    current2.y = 0.01f + (localId / 10000.0f);
    current3.x = 0.002f + (localId / 5000.0f);
    current3.y = 0.001f + (localId / 5000.0f);

    float4 tmp0 = read_imagef(A, funny_sampler, current0);
    float4 tmp1 = read_imagef(A, funny_sampler, current1);
    float4 tmp2 = read_imagef(A, funny_sampler, current2);
    float4 tmp3 = read_imagef(A, funny_sampler, current3);
    for (int i = 0; i < count; i += 4) {
        tmp0 += read_imagef(A, funny_sampler, current0);
        tmp1 += read_imagef(A, funny_sampler, current1);
        tmp2 += read_imagef(A, 
funny_sampler, current2);
        tmp3 += read_imagef(A, funny_sampler, current3);
        current0 += increment;
        current1 += increment;
        current2 += increment;
        current3 += increment;
    }

    *ret = dot(tmp0, tmp1) + dot(tmp2, tmp3);
}

// Cacheline size in bytes, must correspond to what's defined for the latency test
#define CACHELINE_SIZE 64

// Pointer-chasing latency test over global memory: every value loaded from A is
// used as the index of the next load, so the loads form one serial chain.
//
// unrolled until terascale no longer saw further improvement (10x unroll)
// assumes count will be a multiple of 10. but it won't be too inaccurate with a big count
// not divisible by 10
//
// A     - index array; each element is the index of the next element to visit.
// count - number of chained loads (consumed 10 per loop iteration).
// ret   - when more than one work-item is launched, ret[global id] supplies that
//         work-item's starting index; ret[0] receives the accumulated sum.
__kernel void unrolled_latency_test(__global const int* A, int count, __global int* ret) {
    int current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0]; // this will test vector latency on AMD. Set to A[0] for scalar latency
    // Must start at zero: result is only ever updated with +=, so an
    // uninitialized value would be read on the first iteration.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }

    ret[0] = result;
}

// Ensures the loaded value will be constant across a workgroup
__kernel void scalar_unrolled_latency_test(__global const int* A, int count, __global int* ret) {
    int current = get_num_groups(0) > 1 ? 
ret[get_group_id(0) * get_local_size(0)]: A[0];
    // Must start at zero: result is only ever updated with +=, so an
    // uninitialized value would be read on the first iteration.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }

    ret[0] = result;
}

// Takes size as an additional argument, meant to run many pointer chasing threads in parallel
// Tries to measure a GPU's latency hiding ability at varying levels of parallelism
//
// A     - index array; each element is the index of the next element to visit.
// count - number of chained loads per work-item (consumed 10 per loop iteration).
// size  - modulus applied to the global id to pick this work-item's first element of A.
// ret   - one int per work-item, indexed by global id; receives the accumulated sum.
__kernel void parallel_latency_test(__global const int* A, int count, int size, __global int* ret) {
    size_t threadId = get_global_id(0);
    int current = A[threadId % size];
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }

    ret[threadId] = result;
}

// latency test like the unrolled one above, but with input as constant memory
__kernel void constant_unrolled_latency_test(__constant const int* A, int count, __global int* ret) {
    //int current = A[0];
    int current = get_global_size(0) > 1 ? 
ret[get_global_id(0)]: A[0];
    // Must start at zero: result is only ever updated with +=, so an
    // uninitialized value would be read on the first iteration.
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
        result += current;
        current = A[current];
    }

    ret[0] = result;
}

#define local_mem_test_size 1024

// uses local memory (LDS/shmem)
//
// Pointer-chasing latency test where the chase runs over a __local copy of A.
//
// A     - index array with at least local_mem_test_size elements; the first
//         local_mem_test_size elements are copied into local memory.
// count - number of chained loads (consumed 10 per loop iteration).
// ret   - ret[0] receives the accumulated sum, written by local id 0 only.
__kernel void local_unrolled_latency_test(__global const int* A, int count, __global int* ret) {
    __local int local_a[local_mem_test_size]; // 4 KB, should be present on all GPUs, amirite?

    // better be fast
    for (int i = get_local_id(0);i < local_mem_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    // everyone else can chill/get masked off
    if (get_local_id(0) == 0) {
        int current = local_a[0];
        // Must start at zero: result is only ever updated with +=.
        int result = 0;
        for (int i = 0; i < count; i += 10) {
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
            result += current;
            current = local_a[current];
        }

        ret[0] = result;
    }
}

// Global memory bandwidth test: every work-item sums float4 elements of A,
// stepping by the work-group size and restarting at its own start index once it
// runs past the end.
//
// A              - input buffer, read as float4.
// count          - loop bound; the loop advances i by 20 and reads five float4s
//                  (20 floats) per iteration.
// float4size     - number of float4 elements in A; indices at or beyond it wrap
//                  back to the work-item's start index.
// ret            - one float per work-item, indexed by global id.
// skip           - unused by the active code (only referenced in a commented-out line).
// startPositions - per-work-item start index into A (in float4 units), indexed by global id.
__kernel void sum_bw_test(__global float* A, uint count, uint float4size, __global float* ret, uint skip, __global uint *startPositions) {
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);

    // Vector literals need the (float4)(...) form. A bare parenthesised list is a
    // comma expression that evaluates to its last operand only.
    float4 result1 = (float4)(0.1f,0.2f,0.3f,0.4f);
    float4 result2 = (float4)(1.1f,1.2f,1.3f,1.4f);
    float4 result3 = (float4)(2.1f,2.2f,2.3f,2.4f);
    float4 result4 = (float4)(3.0f,3.1f,3.2f,3.3f);
    float4 result5 = (float4)(4.0f,4.2f,4.1f,4.3f);

    int initialIdx = startPositions[threadId];
    //int initialIdx = (groupId * skip * localSize + localId) % (float4size - 1);
    //startPositions[threadId] = initialIdx; // for debugging
    int idx = initialIdx;
    __global float4 *B = (__global float4 *)A;
    for (int i = 0; i < count; i += 20) {
        result1 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result2 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result3 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result4 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
        result5 += B[idx];
        idx += localSize;
        if (idx >= float4size) idx = initialIdx;
    }

    ret[threadId] = dot(result1, result2) + dot(result3, result4) + dot(result4, result5);
}

#define local_mem_bw_test_size 1024
// test bandwidth with local memory. A must be at least local_mem_bw_test_size in floats
__kernel void local_bw_test(__global float* A, uint count, __global float* ret) {
    __local float local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    float acc1 = 1.1;
    float acc2 = 2.2;
    float acc3 = 3.3;
    float acc4 = 4.4;
    //printf("subgroup size %d\n", get_sub_group_size());

    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    // assumes local memory size is at least 1024 float4s
    int idx0 = localId;
    int idx1 = localId + localSize;
    int idx2 = localId + localSize * 2;

    for (int i = 0; i < count; i += 12) {
        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        acc2 += local_a[idx0 + 1] * local_a[idx1 + 1] + local_a[idx2 + 1];
        acc3 += local_a[idx0 + 2] * local_a[idx1 + 2] + local_a[idx2 + 2];
        acc4 += local_a[idx0 + 3] * local_a[idx1 + 3] + local_a[idx2 + 3];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 
+ localSize) & 0x3FF; } ret[threadId] = acc1 + acc2 + acc3 + acc4; } __kernel void local_float4_bw_test(__global float4* A, uint count, __global float* ret) { __local float4 local_a[local_mem_bw_test_size]; int threadId = get_global_id(0); int localId = get_local_id(0); int localSize = get_local_size(0); int groupId = get_group_id(0); float4 acc1 = A[get_global_id(0) & 0x3FF]; float4 acc2 = A[(get_global_id(0) + 1) & 0x3FF]; float4 acc3 = A[(get_global_id(0) + 2) & 0x3FF]; float4 acc4 = A[(get_global_id(0) + 3) & 0x3FF]; // workgroup-wide copy from global mem into local mem for (int i = get_local_id(0);i < local_mem_bw_test_size; i += get_local_size(0)) local_a[i] = A[i]; barrier(CLK_LOCAL_MEM_FENCE); // assumes local memory size is at least 1024 float4s int idx0 = localId; int idx1 = localId + localSize; int idx2 = localId + localSize * 2; for (int i = 0; i < count; i += (12*4)) { acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2]; acc2 += local_a[idx0 + 1] * local_a[idx1 + 1] + local_a[idx2 + 1]; acc3 += local_a[idx0 + 2] * local_a[idx1 + 2] + local_a[idx2 + 2]; acc4 += local_a[idx0 + 3] * local_a[idx1 + 3] + local_a[idx2 + 3]; idx0 = (idx0 + localSize) & 0x3FF; idx1 = (idx1 + localSize) & 0x3FF; idx2 = (idx2 + localSize) & 0x3FF; } ret[threadId] = dot(acc1, acc2) + dot(acc3, acc4); } #define local64_test_size 2048 // size was given in 4B elements. 
// This test uses 8B
// Local memory bandwidth test with 64-bit accesses.
//   A     = input data, local64_test_size ulong elements are copied into local memory
//   count = loop bound; advances by 8 per iteration (4 ulong loads)
//   ret   = per-thread sink
__kernel void local_64_bw_test(__global ulong* A, uint count, __global ulong* ret) {
    __local ulong local_a[local64_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);

    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0); i < local64_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    ulong acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;

    // assumes local memory size is at least 512x 64-bit uints
    int idx0 = localId;
    int idx1 = localId + localSize;
    for (int i = 0; i < count; i += 8) {
        acc0 ^= local_a[idx0];
        acc1 ^= local_a[idx1];
        acc2 ^= local_a[idx0 + 1];
        acc3 ^= local_a[idx1 + 1];
        idx0 = (idx0 + localSize) & 0x1FF;
        idx1 = (idx1 + localSize) & 0x1FF;
    }

    ret[threadId] = acc0 + acc1 + acc2 + acc3;
}

// let's try the method from zhe jia et al
// Every work-item pointer-chases through local memory, starting at its own local id.
//   A     = chase array; local_mem_bw_test_size elements are copied into local memory
//   count = number of dependent loads per work-item (4 per iteration)
//   ret   = per-thread sink
// NOTE(review): local_a is declared ulong although A and sink are uint - confirm the 8B element size is intended.
__kernel void local_chase_bw(__global uint* A, uint count, __global uint* ret) {
    __local ulong local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    uint sink = localId;

    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0); i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int i = 0; i < count; i += 4) {
        sink = local_a[sink];
        sink = local_a[sink];
        sink = local_a[sink];
        sink = local_a[sink];
    }

    ret[threadId] = sink;
}

#define fixed_tex_test_size 1024
// Bandwidth test reading through a 1D image buffer.
//   A     = image buffer; indices are wrapped with & 0x3FF
//   count = loop bound; advances by 16 per iteration (4 reads of a 4-wide vector)
//   ret   = per-thread sink
__kernel void buffer_bw_test(__read_only image1d_buffer_t A, uint count, __global float* ret) {
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    uint4 acc1 = read_imageui(A, 0);
    uint4 acc2 = read_imageui(A, 1);
    uint4 acc3 = read_imageui(A, 2);
    uint4 acc4 = read_imageui(A, 3);
    int idx0 = localId;
    int idx1 = localId + localSize;
    int idx2 = localId + localSize * 2;

    // Each read_imageui reads out a 4-wide vector
    // (a stray read whose result was discarded has been dropped; it was not part of the counted reads)
    for (int i = 0; i < count; i += 16) {
        acc1 += read_imageui(A, idx0);
        acc2 += read_imageui(A, idx1);
        acc3 += read_imageui(A, idx2);
        acc4 += read_imageui(A, idx0 + 1);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }

    float4 out1 = convert_float4(acc1);
    float4 out2 = convert_float4(acc2);
    float4 out3 = convert_float4(acc3);
    float4 out4 = convert_float4(acc4);
    ret[threadId] = dot(out1, out2) + dot(out3, out4);
}

// A = inputs, fixed size
// count = iterations, each performing 12 dependent integer adds
// ret = sink
__kernel void int_exec_latency_test(__global int* A, int count, __global int* ret) {
    int sum = 0;
    int input1 = A[0], input2 = A[1], input3 = A[2], input4 = A[3];
    for (int i = 0; i < count; i++) {
        sum += input1; sum += input2; sum += input3; sum += input4;
        sum += input1; sum += input2; sum += input3; sum += input4;
        sum += input1; sum += input2; sum += input3; sum += input4;
    }

    // Store the result so the add chain is observable; previously sum was never written
    // anywhere, which lets the compiler delete the whole timed loop.
    ret[0] = sum;
}

// hoping each thread/workgroup lands on a different CU
// A = pointer to location being bounced around
// count = iterations
// ret = sink
// t1 = id of thread 1
// t2 = id of thread 2
__kernel void c2c_atomic_exec_latency_test(__global int* A, int count, __global int* ret, int t1, int t2) {
    int global_id = get_global_id(0);
    int current = 0;
    if (global_id == t1) current = 1;
    else if (global_id == t2) current = 2;

    if (global_id == t1 || global_id == t2) {
        //printf("gid: %d, t1: %d, t2: %d, A: %d, current = %d\n", global_id, t1, t2, *A, current);
        // each thread waits for the other's value (current - 1), then publishes its own
        while (current <= 2 * count) {
            if (atomic_cmpxchg(A, current - 1, current) == current - 1) {
                current += 2;
            }
        }

        ret[0] = current;
    }
}

// Work-items take turns advancing the value at A with compare-and-exchange.
// Each one starts at (global id + 1) and steps by 2.
__kernel void atomic_exec_latency_test(__global int* A, int count, __global int* ret) {
    int current = get_global_id(0) + 1;
    while (current <= 2 * count) {
        if (atomic_cmpxchg(A, current - 1, current) == current - 1) {
            current += 2;
        }
    }
}

// Same ping-pong as atomic_exec_latency_test, but on a value held in local memory.
__kernel void local_atomic_latency_test(__global int* A, int count, __global int* ret) {
    __local int a[1];
    int current = get_global_id(0) + 1;
    if (current == 1) a[0] = A[0];
    barrier(CLK_LOCAL_MEM_FENCE);
    while (current <= 2 * count) {
        if (atomic_cmpxchg(a, current - 1, current) == current - 1) {
            current += 2;
        }
    }
}

// Increments the element of A that corresponds to this work-item.
__kernel void dummy_add(__global int* A) {
    A[get_global_id(0)]++;
}

================================================
FILE: GpuMemLatency/kernels/atomic_exec_latency_test.cl
================================================
// Work-items take turns advancing the value at A with compare-and-exchange.
// Each one starts at (global id + 1) and steps by 2.
__kernel void atomic_exec_latency_test(__global int* A, int count, __global int* ret) {
    int current = get_global_id(0) + 1;
    while (current <= 2 * count) {
        if (atomic_cmpxchg(A, current - 1, current) == current - 1) {
            current += 2;
        }
    }
}

// Each work-item issues 8 atomic adds per iteration to its own element of A.
//   A     = one element per work-item
//   count = iterations
__kernel void atomic_add_test(__global int *A, int count) {
    int addend = get_global_id(0);
    int addend1 = addend + 5;
    int addend2 = addend + 6;
    int addend3 = addend + 7;
    int addend4 = addend + 8;
    int addend5 = addend + 9;
    int addend6 = addend + 10;
    int addend7 = addend + 11;
    __global int *target = A + get_global_id(0);
    for (int i = 0; i < count; i++) {
        atomic_add(target, addend);
        atomic_add(target, addend1);
        atomic_add(target, addend2);
        atomic_add(target, addend3);
        atomic_add(target, addend4);
        atomic_add(target, addend5);
        atomic_add(target, addend6);
        atomic_add(target, addend7);
    }
}

================================================
FILE: GpuMemLatency/kernels/buffer_bw_test.cl
================================================
#define fixed_tex_test_size 1024
// Bandwidth test reading through a 1D image buffer.
//   A     = image buffer; indices are wrapped with & 0x3FF
//   count = loop bound; advances by 16 per iteration (4 reads of a 4-wide vector)
//   ret   = per-thread sink
__kernel void buffer_bw_test(__read_only image1d_buffer_t A, uint count, __global float* ret) {
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    uint4 acc1 = read_imageui(A, 0);
    uint4 acc2 = read_imageui(A, 1);
    uint4 acc3 = read_imageui(A, 2);
    uint4 acc4 = read_imageui(A, 3);
    int idx0 = localId;
    int idx1 = localId + localSize;
    int idx2 = localId + localSize * 2;

    // Each read_imageui reads out a 4-wide vector
    // (a stray read whose result was discarded has been dropped; it was not part of the counted reads)
    for (int i = 0; i < count; i += 16) {
        acc1 += read_imageui(A, idx0);
        acc2 += read_imageui(A, idx1);
        acc3 += read_imageui(A, idx2);
        acc4 += read_imageui(A, idx0 + 1);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }

    float4 out1 = convert_float4(acc1);
    float4 out2 = convert_float4(acc2);
    float4 out3 = convert_float4(acc3);
    float4 out4 = convert_float4(acc4);
    ret[threadId] = dot(out1, out2) + dot(out3, out4);
}

================================================
FILE: GpuMemLatency/kernels/c2c_atomic_exec_latency_test.cl
================================================
// hoping each thread/workgroup lands on a different CU
// A = pointer to location being bounced around
// count = iterations
// ret = sink
// t1 = id of thread 1
// t2 = id of thread 2
__kernel void c2c_atomic_exec_latency_test(__global int* A, int count, __global int* ret, int t1, int t2) {
    int global_id = get_global_id(0);
    int current = 0;
    if (global_id == t1) current = 1;
    else if (global_id == t2) current = 2;

    if (global_id == t1 || global_id == t2) {
        //printf("gid: %d, t1: %d, t2: %d, A: %d, current = %d\n", global_id, t1, t2, *A, current);
        // each thread waits for the other's value (current - 1), then publishes its own
        while (current <= 2 * count) {
            if (atomic_cmpxchg(A, current - 1, current) == current - 1) {
                current += 2;
            }
        }

        ret[0] = current;
    }
}

================================================
FILE: GpuMemLatency/kernels/constant_unrolled_latency_test.cl
================================================
// latency test like the unrolled one above, but with input as constant memory
//   A     = pointer chasing array in __constant memory
//   count = number of dependent loads (loop is unrolled 10x)
//   ret   = holds per-thread start indices when more than one work-item runs; ret[0] receives the sink
__kernel void constant_unrolled_latency_test(__constant const int* A, int count, __global int* ret) {
    //int current = A[0];
    int current = get_global_size(0) > 1 ? ret[get_global_id(0)] : A[0];
    // must start at a defined value: the original accumulated into an uninitialized variable
    int result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
    }

    ret[0] = result;
}

================================================
FILE: GpuMemLatency/kernels/ldst_bw_test.cl
================================================
#define ldst_bw_test_size 1024

// test load/store bandwidth with a small test size that should fit in L1
// (An older scalar-float variant that advanced its indices with clamp() used to be kept here
// commented out; the float4 kernel below is the one that is compiled.)
//   A     = input data; indices are wrapped with & 0x3FF
//   count = loop bound; advances by 16 * 4 per iteration (16 float4 loads)
//   ret   = per-thread sink
__kernel void ldst_bw_test(__global float4* A, uint count, __global float* ret) {
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    float acc1 = 1.1;
    float acc2 = 2.2;
    float acc3 = 3.3;
    float acc4 = 4.4;

    // assumes local memory size is at least 1024 float4s
    int idx0 = localId;
    int idx1 = idx0 + localSize;
    int idx2 = idx1 + localSize;
    int idx3 = idx2 + localSize;
    for (int i = 0; i < count; i += (16*4)) {
        acc1 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        idx3 = (idx3 + localSize) & 0x3FF;
        acc2 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        idx3 = (idx3 + localSize) & 0x3FF;
        acc3 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        idx3 = (idx3 + localSize) & 0x3FF;
        acc4 += dot(A[idx0], A[idx1]) + dot(A[idx2], A[idx3]);
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        idx3 = (idx3 + localSize) & 0x3FF;
    }

    ret[threadId] = acc1 + acc2 + acc3 + acc4;
}

================================================
FILE: GpuMemLatency/kernels/local_64_bw_test.cl
================================================
#define local64_test_size 2048 // size was given in 4B elements. 
// This test uses 8B
// Local memory bandwidth test with 64-bit accesses.
//   A     = input data, local64_test_size ulong elements are copied into local memory
//   count = loop bound; advances by 8 per iteration (4 ulong loads)
//   ret   = per-thread sink
__kernel void local_64_bw_test(__global ulong* A, uint count, __global ulong* ret) {
    __local ulong local_a[local64_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);

    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0); i < local64_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    ulong acc0 = 0, acc1 = 0, acc2 = 0, acc3 = 0;

    // assumes local memory size is at least 512x 64-bit uints
    int idx0 = localId;
    int idx1 = localId + localSize;
    for (int i = 0; i < count; i += 8) {
        acc0 ^= local_a[idx0];
        acc1 ^= local_a[idx1];
        idx0 = (idx0 + localSize) & 0x1FF;
        idx1 = (idx1 + localSize) & 0x1FF;
        // second pair goes into acc2/acc3: the previous code used acc3 and an undeclared acc4,
        // which does not compile and left acc2 unused
        acc2 ^= local_a[idx0];
        acc3 ^= local_a[idx1];
        idx0 = (idx0 + localSize) & 0x1FF;
        idx1 = (idx1 + localSize) & 0x1FF;
    }

    ret[threadId] = acc0 + acc1 + acc2 + acc3;
}

================================================
FILE: GpuMemLatency/kernels/local_atomic_latency_test.cl
================================================
// Work-items take turns advancing a value held in local memory with compare-and-exchange.
// Each one starts at (global id + 1) and steps by 2.
__kernel void local_atomic_latency_test(__global int* A, int count, __global int* ret) {
    __local int a[1];
    int current = get_global_id(0) + 1;
    if (current == 1) a[0] = A[0];
    barrier(CLK_LOCAL_MEM_FENCE);
    while (current <= 2 * count) {
        if (atomic_cmpxchg(a, current - 1, current) == current - 1) {
            current += 2;
        }
    }
}

#define local_atomic_add_wg_size 256
// Each work-item issues 8 atomic adds per iteration to its own element of a local memory array,
// then writes that element back to A.
//   A     = one element per work-item
//   count = iterations
// NOTE(review): local_a is indexed by local id, so this presumably requires a workgroup size
// of at most local_atomic_add_wg_size - confirm against the host code.
__kernel void local_atomic_add_test(__global int *A, int count) {
    __local int local_a[local_atomic_add_wg_size];
    local_a[get_local_id(0)] = A[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);
    int addend = get_global_id(0);
    int addend1 = addend + 5;
    int addend2 = addend + 6;
    int addend3 = addend + 7;
    int addend4 = addend + 8;
    int addend5 = addend + 9;
    int addend6 = addend + 10;
    int addend7 = addend + 11;
    __local int *target = local_a + get_local_id(0);
    for (int i = 0; i < count; i++) {
        atomic_add(target, addend);
        atomic_add(target, addend1);
        atomic_add(target, addend2);
        atomic_add(target, addend3);
        atomic_add(target, addend4);
        atomic_add(target, addend5);
        atomic_add(target, addend6);
        atomic_add(target, addend7);
    }

    A[get_global_id(0)] = local_a[get_local_id(0)];
}

================================================
FILE: GpuMemLatency/kernels/local_bw_test.cl
================================================
#define local_mem_bw_test_size 1024
// test bandwidth with local memory. A must be at least local_mem_bw_test_size in floats
//   count = number of local memory loads per work-item (12 per iteration)
//   ret   = per-thread sink
__kernel void local_bw_test(__global float* A, uint count, __global float* ret) {
    __local float local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    float acc1 = 1.1;
    float acc2 = 2.2;
    float acc3 = 3.3;
    float acc4 = 4.4;
    //printf("subgroup size %d\n", get_sub_group_size());

    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0); i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    // assumes local memory size is at least 1024 float4s
    // Starting indices are masked like every later step, so a workgroup larger than a third
    // of the array cannot begin with an index past the end of local_a.
    int idx0 = localId & 0x3FF;
    int idx1 = (localId + localSize) & 0x3FF;
    int idx2 = (localId + localSize * 2) & 0x3FF;
    for (int i = 0; i < count; i += 12) {
        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc2 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc3 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc4 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }

    ret[threadId] = acc1 + acc2 + acc3 + acc4;
}

================================================
FILE: 
GpuMemLatency/kernels/local_float4_bw_test.cl
================================================
#define local_mem_bw_test_size 1024

// Local memory bandwidth test where every access is a float4.
//   A     = input data, local_mem_bw_test_size float4 elements are copied into local memory
//   count = loop bound; advances by 12 * 4 per iteration (12 float4 loads)
//   ret   = per-thread sink
__kernel void local_float4_bw_test(__global float4* A, uint count, __global float* ret) {
    __local float4 local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    float4 acc1 = A[get_global_id(0) & 0x3FF];
    float4 acc2 = A[(get_global_id(0) + 1) & 0x3FF];
    float4 acc3 = A[(get_global_id(0) + 2) & 0x3FF];
    float4 acc4 = A[(get_global_id(0) + 3) & 0x3FF];

    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0); i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    // assumes local memory size is at least 1024 float4s
    // Starting indices are masked like every later step, so a workgroup larger than a third
    // of the array cannot begin with an index past the end of local_a.
    int idx0 = localId & 0x3FF;
    int idx1 = (localId + localSize) & 0x3FF;
    int idx2 = (localId + localSize * 2) & 0x3FF;
    for (int i = 0; i < count; i += (12 * 4)) {
        acc1 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc2 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc3 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        acc4 += local_a[idx0] * local_a[idx1] + local_a[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }

    ret[threadId] = dot(acc1, acc2) + dot(acc3, acc4);
}

// Mixed test: each step reads two float4s from global memory and does a read-modify-write
// of one float4 in local memory.
//   A     = input data; indices are wrapped with & 0x3FF
//   count = loop bound; advances by 16 * 4 per iteration
//   ret   = per-thread sink
__kernel void mixed_float4_bw_test(__global float4* A, uint count, __global float* ret) {
    __local float4 local_a[local_mem_bw_test_size];
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    // (eight accumulators that were loaded from A here but never used have been removed)

    // workgroup-wide copy from global mem into local mem
    for (int i = get_local_id(0); i < local_mem_bw_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    // assumes local memory size is at least 1024 float4s
    // starting indices are masked like every later step so they cannot begin out of range
    int idx0 = localId & 0x3FF;
    int idx1 = (localId + localSize) & 0x3FF;
    int idx2 = (localId + localSize * 2) & 0x3FF;
    for (int i = 0; i < count; i += (16*4)) {
        local_a[idx0] += A[idx1] * A[idx2]; // 4 * (3R 1W)
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        local_a[idx0] += A[idx1] * A[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        local_a[idx0] += A[idx1] * A[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
        local_a[idx0] += A[idx1] * A[idx2];
        idx0 = (idx0 + localSize) & 0x3FF;
        idx1 = (idx1 + localSize) & 0x3FF;
        idx2 = (idx2 + localSize) & 0x3FF;
    }

    // neighbour index is wrapped so local id 1023 does not read one element past local_a
    ret[threadId] = dot(local_a[get_local_id(0)], local_a[(get_local_id(0) + 1) & 0x3FF]);
}

================================================
FILE: GpuMemLatency/kernels/local_unrolled_latency_test.cl
================================================
#define local_mem_test_size 1024
// uses local memory (LDS/shmem)
//   A     = pointer chasing array, at least local_mem_test_size elements are read
//   count = number of dependent local memory loads (loop is unrolled 10x)
//   ret   = ret[0] receives the sink
__kernel void local_unrolled_latency_test(__global const uint* A, int count, __global uint* ret) {
    __local uint local_a[local_mem_test_size]; // 4 KB, should be present on all GPUs, amirite?

    // better be fast
    for (int i = get_local_id(0); i < local_mem_test_size; i += get_local_size(0))
        local_a[i] = A[i];
    barrier(CLK_LOCAL_MEM_FENCE);

    // everyone else can chill/get masked off
    if (get_local_id(0) == 0) {
        uint current = local_a[0];
        // must start at a defined value: the original accumulated into an uninitialized variable
        uint result = 0;
        for (int i = 0; i < count; i += 10) {
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
            result += current; current = local_a[current];
        }

        ret[0] = result;
    }
}

================================================
FILE: GpuMemLatency/kernels/scalar_unrolled_latency_test.cl
================================================
// Ensures the loaded value will be constant across a workgroup
//   A     = pointer chasing array
//   count = number of dependent loads (loop is unrolled 10x)
//   ret   = holds start indices when more than one workgroup runs (read at the workgroup's
//           first global id); ret[0] receives the sink
__kernel void scalar_unrolled_latency_test(__global const uint* A, int count, __global uint* ret) {
    uint current = get_num_groups(0) > 1 ? ret[get_group_id(0) * get_local_size(0)] : A[0];
    // must start at a defined value: the original accumulated into an uninitialized variable
    uint result = 0;
    for (int i = 0; i < count; i += 10) {
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
        result += current; current = A[current];
    }

    ret[0] = result;
}

================================================
FILE: GpuMemLatency/kernels/sum_bw_test.cl
================================================
// Global memory read bandwidth test. Each work-item sums float4 elements, striding by the
// workgroup size and wrapping back to its own start position.
//   A              = input data, accessed as float4
//   count          = number of floats to read per work-item (5 float4 loads = 20 floats per iteration)
//   float4size     = number of float4 elements in A; idx wraps when it reaches this
//   ret            = per-thread sink
//   skip           = unused here (only referenced by the commented-out index calculation)
//   startPositions = per-thread starting float4 index
__kernel void sum_bw_test(__global float* A, uint count, uint float4size, __global float* ret, uint skip, __global uint *startPositions) {
    int threadId = get_global_id(0);
    int localId = get_local_id(0);
    int localSize = get_local_size(0);
    int groupId = get_group_id(0);
    // (float4)(...) is required for a vector literal. Without the cast the parentheses hold a
    // comma expression, so only the last value was used for every component.
    float4 result1 = (float4)(0.1f, 0.2f, 0.3f, 0.4f);
    float4 result2 = (float4)(1.1f, 1.2f, 1.3f, 1.4f);
    float4 result3 = (float4)(2.1f, 2.2f, 2.3f, 2.4f);
    float4 result4 = (float4)(3.0f, 3.1f, 3.2f, 3.3f);
    float4 result5 = (float4)(4.0f, 4.2f, 4.1f, 4.3f);
    int initialIdx = startPositions[threadId];
    //int initialIdx = (groupId * skip * localSize + localId) % (float4size - 1);
    //startPositions[threadId] = initialIdx; // for debugging
    int idx = initialIdx;
    __global float4 *B = (__global float4 *)A;
    for (int i = 0; i < count; i += 20) {
        result1 += B[idx]; idx += localSize; if (idx >= float4size) idx = initialIdx;
        result2 += B[idx]; idx += localSize; if (idx >= float4size) idx = initialIdx;
        result3 += B[idx]; idx += localSize; if (idx >= float4size) idx = initialIdx;
        result4 += B[idx]; idx += localSize; if (idx >= float4size) idx = initialIdx;
        result5 += B[idx]; idx += localSize; if (idx >= float4size) idx = initialIdx;
    }

    ret[threadId] = dot(result1, result2) + dot(result3, result4) + dot(result4, result5);
}

================================================
FILE: GpuMemLatency/kernels/tex_bw_test.cl
================================================ __constant sampler_t funny_sampler = CLK_NORMALIZED_COORDS_TRUE | // coordinates are from 0 to 1 (float) CLK_ADDRESS_REPEAT | // going out of bounds = replicate CLK_FILTER_NEAREST; __kernel void tex_bw_test(__read_only image2d_t A, int count, __global float* ret) { int localId = get_local_id(0); float pos = get_global_id(0) * native_recip((float)get_global_size(0)); float2 increment; increment.x = 0.01; // guessing increment.y = 0.01; float2 current0, current1, current2, current3; current0.x = pos; current0.y = pos; current1.x = 0.1 + (localId / 10000); current1.y = 0.1 + (localId / 10000); current2.x = 0.01 + (localId / 10000); current2.y = 0.01 + (localId / 10000); current3.x = 0.002 + (localId / 5000); current3.y = 0.001 + (localId / 5000); float4 tmp0 = read_imagef(A, funny_sampler, current0); float4 tmp1 = read_imagef(A, funny_sampler, current1); float4 tmp2 = read_imagef(A, funny_sampler, current2); float4 tmp3 = read_imagef(A, funny_sampler, current3); for (int i = 0; i < count; i += 4) { tmp0 += read_imagef(A, funny_sampler, current0); tmp1 += read_imagef(A, funny_sampler, current1); tmp2 += read_imagef(A, funny_sampler, current2); tmp3 += read_imagef(A, funny_sampler, current3); current0 += increment; current1 += increment; current2 += increment; current3 += increment; } *ret = dot(tmp0, tmp1) + dot(tmp2, tmp3); } ================================================ FILE: GpuMemLatency/kernels/tex_latency_test.cl ================================================ __kernel void tex_latency_test(__read_only image1d_buffer_t A, int count, __global int* ret, int list_size) { int localId = get_local_id(0); // uint4 current = read_imageui(A, direct_sampler, 0); // using sampler screws things up int startPos = get_global_size(0) > 1 ? 
ret[get_global_id(0)] : 0; uint4 current = read_imageui(A, startPos); // printf("start x: %u -> %u\n", startPos, current.x); for (int i = 0; i < count; i += 10) { // printf("current: %u %u %u %u, address: %d\n", current.x, current.y, current.z, current.w, (int)current.x / 4); //current = read_imageui(A, direct_sampler, i); current = read_imageui(A, current.x); current = read_imageui(A, current.x); current = read_imageui(A, current.x); current = read_imageui(A, current.x); current = read_imageui(A, current.x); current = read_imageui(A, current.x); current = read_imageui(A, current.x); current = read_imageui(A, current.x); current = read_imageui(A, current.x); current = read_imageui(A, current.x); //printf("%d: current read: %u %u %u %u\n", i, current.x, current.y, current.z, current.w); // local_a[localId] = current; } ret[get_global_id(0)] = current.x; } ================================================ FILE: GpuMemLatency/kernels/unrolled_latency_test.cl ================================================ // unrolled until terascale no longer saw further improvement (10x unroll) // assumes count will be a multiple of 10. but it won't be too inaccurate with a big count // not divisible by 10 __kernel void unrolled_latency_test(__global const uint* A, int count, __global uint* ret) { uint current = get_global_size(0) > 1 ? ret[get_global_id(0)]: A[0]; // this will test vector latency on AMD. 
Set to A[0] for scalar latency uint result; for (int i = 0; i < count; i += 10) { result += current; current = A[current]; result += current; current = A[current]; result += current; current = A[current]; result += current; current = A[current]; result += current; current = A[current]; result += current; current = A[current]; result += current; current = A[current]; result += current; current = A[current]; result += current; current = A[current]; result += current; current = A[current]; } ret[0] = result; } ================================================ FILE: GpuMemLatency/latency_test.c ================================================ #include "opencltest.h" // list_size = number of 4B (32-bit) elements float latency_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t list_size, uint32_t chase_iterations, short uniform, int threads, int local_size, int wave_size, int stride, uint32_t *elapsed_ms) { size_t global_item_size = 1, local_item_size = 1; cl_int ret; float latency; int64_t time_diff_ms; uint32_t result; if (threads && local_size) { local_item_size = local_size; global_item_size = threads; } // fprintf(stderr, "Testing latency with %d threads %d local size %d list size\n", threads, local_size, list_size); // Sanity Checks if (!uniform && ((stride * 2 > list_size * 4) || // 2 cache lines ((threads > 1) && (stride * 2 > (list_size * 4 / (threads / wave_size)))))) // handle partition case { fprintf(stderr, "Less than 2 lines will be visited with stride %d, list size %dx 32-bit INTs\n", stride, list_size); return 1.0f; } // Fill pattern arr uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * list_size); uint32_t* thread_start = (uint32_t*)malloc(sizeof(uint32_t) * (global_item_size)); memset(A, 0, sizeof(uint32_t) * list_size); if (threads < 2 || uniform) { FillPatternArr(A, list_size, stride); thread_start[0] = 0; } else { if (wave_size <= 1) wave_size = 1; // partition pattern arr, creating a section for each wave int 
wave_count = threads / wave_size; int sub_list_size = list_size / wave_count; for (int waveId = 0; waveId < wave_count; waveId++) { int waveId_start = sub_list_size * waveId; thread_start[wave_size * waveId] = waveId_start; FillPatternArr(A + waveId_start, sub_list_size, stride); // fprintf(stderr, "starting thread %d at %d\n", threadId, threadId_start); // offset indices for (int subIdx = 0; subIdx < sub_list_size; subIdx++) { A[waveId_start + subIdx] += waveId_start; } } // make sure all threads in a wave access the same item for (int i = 1; i < threads; i++) { int waveId = i / wave_size; thread_start[i] = thread_start[waveId * wave_size]; //fprintf(stderr, "wave %d thread %d starting at %d\n", waveId, i, thread_start[i]); } } // copy array to device cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_size * sizeof(uint32_t), NULL, &ret); clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, list_size * sizeof(uint32_t), A, 0, NULL, NULL); cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, global_item_size * sizeof(uint32_t), NULL, &ret); clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, global_item_size * sizeof(uint32_t), thread_start, 0, NULL, NULL); clFinish(command_queue); // Set kernel arguments ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to set list as kernel arg. clSetKernelArg returned %d\n", ret); latency = 0; goto cleanup; } ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj); start_timing(); // Execute the OpenCL kernel. launch a single thread ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. 
clEnqueueNDRangeKernel returned %d\n", ret); latency = 0; goto cleanup; } ret = clFinish(command_queue); // returns success even when TDR happens? if (ret != CL_SUCCESS) { printf("Failed to finish command queue. clFinish returned %d\n", ret); latency = 0; goto cleanup; } time_diff_ms = end_timing(); if (elapsed_ms != NULL) *elapsed_ms = time_diff_ms; latency = 1e6 * (float)time_diff_ms / (float)chase_iterations; ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(uint32_t), &result, 0, NULL, NULL); clFinish(command_queue); //fprintf(stderr, "Finished reading result. Sum: %d\n", result[0]); cleanup: clFlush(command_queue); clFinish(command_queue); clReleaseMemObject(a_mem_obj); clReleaseMemObject(result_obj); free(A); return latency; } float tex_latency_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t list_size, uint32_t chase_iterations, int threads, int local_size, int wave_size) { size_t global_item_size = 1, local_item_size = 1; cl_int ret = 0; uint32_t result; cl_mem a_mem_obj = NULL, result_obj = NULL, tex_obj = NULL; float latency = 0; if (threads > 1) { global_item_size = threads; local_item_size = local_size; } uint32_t* A = (uint32_t*)malloc(sizeof(uint32_t) * list_size); uint32_t* thread_start = (uint32_t*)malloc(sizeof(uint32_t) * (global_item_size)); memset(A, 0, sizeof(uint32_t) * list_size); if (threads < 2) { FillPatternArr(A, list_size, CACHELINE_SIZE); thread_start[0] = 0; } else { if (wave_size <= 1) wave_size = 1; // partition pattern arr, creating a section for each wave int wave_count = threads / wave_size; int sub_list_size = list_size / wave_count; for (int waveId = 0; waveId < wave_count; waveId++) { int waveId_start = sub_list_size * waveId; thread_start[wave_size * waveId] = waveId_start; FillPatternArr(A + waveId_start, sub_list_size, CACHELINE_SIZE); // fprintf(stderr, "starting thread %d at %d\n", threadId, threadId_start); // offset indices for (int subIdx = 0; subIdx < 
sub_list_size; subIdx++) { A[waveId_start + subIdx] += waveId_start; } } // make sure all threads in a wave access the same item for (int i = 1; i < threads; i++) { int waveId = i / wave_size; thread_start[i] = thread_start[waveId * wave_size]; //fprintf(stderr, "wave %d thread %d starting at %d\n", waveId, i, thread_start[i]); } } // use buffer as texture a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, list_size * sizeof(uint32_t), NULL, &ret); clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, list_size * sizeof(uint32_t), A, 0, NULL, NULL); clFinish(command_queue); cl_image_format imageFormat; imageFormat.image_channel_data_type = CL_UNSIGNED_INT32; imageFormat.image_channel_order = CL_R; cl_image_desc imageDesc; memset(&imageDesc, 0, sizeof(cl_image_desc)); imageDesc.buffer = a_mem_obj; imageDesc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; imageDesc.image_width = list_size; // width in pixels //imageDesc.image_height = 1; // not used for 1D image //imageDesc.image_depth = 1; // not used for 1D image //imageDesc.mem_object = a_mem_obj; tex_obj = clCreateImage(context, CL_MEM_READ_ONLY, &imageFormat, &imageDesc, NULL, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to create image: %d\n", ret); goto texLatencyCleanup; } size_t origin[] = { 0, 0, 0 }; size_t region[] = { imageDesc.image_width, 1, 1 }; ret = clEnqueueWriteImage(command_queue, tex_obj, CL_TRUE, origin, region, 0, 0, A, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to copy image: %d\n", ret); goto texLatencyCleanup; } result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, global_item_size * sizeof(uint32_t), NULL, &ret); clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, global_item_size * sizeof(uint32_t), thread_start, 0, NULL, NULL); clFinish(command_queue); ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&tex_obj); ret = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&chase_iterations); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), 
(void*)&result_obj); ret = clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&list_size); start_timing(); // Execute the OpenCL kernel ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. clEnqueueNDRangeKernel returned %d\n", ret); latency = 0; goto texLatencyCleanup; } ret = clFinish(command_queue); // returns success even when TDR happens? if (ret != CL_SUCCESS) { printf("Failed to finish command queue. clFinish returned %d\n", ret); latency = 0; goto texLatencyCleanup; } uint64_t time_diff_ms = end_timing(); latency = 1e6 * (float)time_diff_ms / (float)chase_iterations; ret = clEnqueueReadBuffer(command_queue, result_obj, CL_TRUE, 0, global_item_size * sizeof(uint32_t), thread_start, 0, NULL, NULL); clFinish(command_queue); // for (int i = 0; i < global_item_size; i++) fprintf(stderr, "Thread %d ended at %d\n", i, thread_start[i]); texLatencyCleanup: clFlush(command_queue); clFinish(command_queue); clReleaseMemObject(a_mem_obj); clReleaseMemObject(tex_obj); clReleaseMemObject(result_obj); free(A); return latency; } ================================================ FILE: GpuMemLatency/local_mem_latency_kernel.cl ================================================ // for testing total local memory capacity by seeing when threads can no longer overlap in time // due to local mem capacity limits across the GPU // calling code expected to define LATENCY_LOCAL_MEM_SIZE __kernel void unrolled_latency_test_localmem(__global const int* A, int count, __global int* ret) { __local int local_a[LATENCY_LOCAL_MEM_SIZE]; int start = A[0]; // this will test scalar latency, always int current = A[start]; int result; for (int i = 0; i < count; i += 10) { result += current; current = A[current]; result += current; current = A[current]; result += current; current = A[current]; result += current; current = A[current]; result += current; current 
= A[current]; result += current; current = A[current]; result += current; current = A[current]; result += current; current = A[current]; result += current; current = A[current]; result += current; current = A[current]; local_a[i & (LATENCY_LOCAL_MEM_SIZE - 1)] = current; } ret[0] = local_a[current & (LATENCY_LOCAL_MEM_SIZE - 1)]; } ================================================ FILE: GpuMemLatency/opencltest.c ================================================ #include "opencltest.h" // default test sizes for latency, in KB int default_test_sizes[] = { 1, 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 144, 160, 172, 192, 256, 384, 512, 600, 768, 1024, 1536, 2048, 3072, 4096, 5120, 6144, 8192, 16384, 18432, 20480, 24576, 25600, 28672, 32768, 36864, 40960, 41200, 49152, 65536, 98304, 131072, 196608, 262144, 524288, 768432, 819200, 921600, 1048576 }; // lining this up with nemes's VK bw test sizes. units for this one are in bytes const uint64_t default_bw_test_sizes[] = { 4096, 8192, 12288, 16384, 20480, 24576, 28672, 32768, 40960, 49152, 57344, 65536, 81920, 98304, 114688, 131072, 196608, 262144, 393216, 458752, 524288, 786432, 1048576, 1572864, 2097152, 3145728, 4194304, 6291456, 8388608, 12582912, 16777216, 20971520, 25165824, 33554432, 37748736, 41943040, 50331648, 58720256, 67108864, 100663296, 134217728, 201326592, 268435456, 402653184, 536870912, 805306368, 1073741824, 1610579968, 2147483648, 3221225472, 4294967296 }; float int_exec_latency_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t iterations); uint32_t scale_bw_iterations(uint32_t base_iterations, uint32_t size_kb); uint64_t scale_iterations(uint32_t size_kb, uint64_t iterations); cl_ulong get_max_buffer_size(); cl_ulong get_max_constant_buffer_size(); enum TestType { VectorMemLatency, ScalarMemLatency, ConstantMemLatency, LocalMemCapacity, LocalMemLatency, TexMemLatency, GlobalAtomicLatency, LocalAtomicLatency, GlobalAtomicAdd, LocalAtomicAdd, GlobalMemBandwidth, 
LocalMemBandwidth, LocalMemChaseBandwidth, LocalMem64Bandwidth, LocalMemFloat4Bandwidth, MixedFloat4Bandwidth, LoadStoreBandwidth, TextureThroughput, BufferBandwidth, MemBandwidthWorkgroupScaling, CoreToCore, LinkBandwidth, InstructionRate, Divergence, Partition, MemDivergence }; int main(int argc, char* argv[]) { cl_int ret; uint32_t stride = 64; uint32_t list_size = 3840 * 2160 * 4; uint32_t chase_iterations = 1e6 * 7; // skip = 0 means auto uint32_t thread_count = 1, local_size = 1, skip = 0, wave = 0; float result; int platform_index = -1, device_index = -1; enum TestType testType = VectorMemLatency; char thread_count_set = 0, local_size_set = 0, chase_iterations_set = 0, skip_set = 0; int sizeKb = 0; int forceCuCount = 0; int forcefp16 = 0, forcefp64 = 0; // vars for local mem capacity testing int local_mem_size_kb = 0; // local mem allocated for each wg int group_count = 0; // max wg count for (int argIdx = 1; argIdx < argc; argIdx++) { if (*(argv[argIdx]) == '-') { char* arg = argv[argIdx] + 1; if (_strnicmp(arg, "stride", 6) == 0) { argIdx++; stride = atoi(argv[argIdx]); fprintf(stderr, "Using stride = %u\n", stride); } else if (_strnicmp(arg, "iterations", 10) == 0) { argIdx++; chase_iterations = atoi(argv[argIdx]); chase_iterations_set = 1; fprintf(stderr, "Using %u iterations\n", chase_iterations); } else if (_strnicmp(arg, "threads", 7) == 0) { argIdx++; thread_count = atoi(argv[argIdx]); thread_count_set = 1; fprintf(stderr, "Using %u threads\n", thread_count); } else if (_strnicmp(arg, "localsize", 9) == 0) { argIdx++; local_size = atoi(argv[argIdx]); local_size_set = 1; fprintf(stderr, "Using local size = %u\n", local_size); } else if (_strnicmp(arg, "wave", 4) == 0) { argIdx++; wave = atoi(argv[argIdx]); fprintf(stderr, "Estimated wave size = %u\n", wave); } else if (_strnicmp(arg, "platform", 8) == 0) { argIdx++; platform_index = atoi(argv[argIdx]); fprintf(stderr, "Using OpenCL platform index %d\n", platform_index); } else if (_strnicmp(arg, 
"device", 6) == 0) { argIdx++; device_index = atoi(argv[argIdx]); fprintf(stderr, "Using OpenCL device index %d\n", device_index); } else if (_strnicmp(arg, "bwskip", 6) == 0) { argIdx++; skip = atoi(argv[argIdx]); fprintf(stderr, "Workgroups will be spaced %u apart\n", skip); } else if (_strnicmp(arg, "sizekb", 6) == 0) { argIdx++; sizeKb = atoi(argv[argIdx]); fprintf(stderr, "Only testing %d KB\n", sizeKb); } else if (_strnicmp(arg, "localmemsize", 12) == 0) { argIdx++; local_mem_size_kb = atoi(argv[argIdx]); fprintf(stderr, "Testing with %d of local memory allocated per WG\n", local_mem_size_kb); } else if (_strnicmp(arg, "groupcount", 10) == 0) { argIdx++; group_count = atoi(argv[argIdx]); fprintf(stderr, "Testing with up to %d WGs\n", group_count); } else if (_strnicmp(arg, "saveprogram", 11) == 0) { saveprogram = 1; fprintf(stderr, "Writing compiled program to disk\n"); } else if (_strnicmp(arg, "forcefp16", 10) == 0) { forcefp16 = 1; fprintf(stderr, "For instruction rate testing, will run FP16 tests regardless of whether support is advertised\n"); } else if (_strnicmp(arg, "forcefp64", 10) == 0) { forcefp64 = 1; fprintf(stderr, "For instruction rate testing, will run FP64 tests regardless of whether support is advertised\n"); } else if (_strnicmp(arg, "test", 4) == 0) { argIdx++; if (_strnicmp(argv[argIdx], "vectorlatency", 13) == 0) { testType = VectorMemLatency; fprintf(stderr, "Testing global memory latency, vector accesses\n"); } else if (_strnicmp(argv[argIdx], "scalarlatency", 13) == 0) { testType = ScalarMemLatency; fprintf(stderr, "Testing global memory latency, scalar accesses\n"); } else if (_strnicmp(argv[argIdx], "constantlatency", 15) == 0) { testType = ConstantMemLatency; fprintf(stderr, "Testing constant memory latency\n"); } else if (_strnicmp(argv[argIdx], "memdivergence", 13) == 0) { testType = MemDivergence; fprintf(stderr, "Testing memory access divergence cost\n"); } else if (_strnicmp(argv[argIdx], "localmemcapacity", 16) == 0) { 
testType = LocalMemCapacity; fprintf(stderr, "Testing GPU-wide local memory capacity. Make sure localmemsize/groupcount are set appropriately!\n"); if (sizeKb == 0) sizeKb = 1; if (group_count == 0) group_count = 16; } else if (_strnicmp(argv[argIdx], "globalatomiccmpxchg", 19) == 0) { testType = GlobalAtomicLatency; fprintf(stderr, "Testing global atomic latency (cmpxchg)\n"); } else if (_strnicmp(argv[argIdx], "globalatomicadd", 15) == 0) { testType = GlobalAtomicAdd; fprintf(stderr, "Testing global atomic add\n"); } else if (_strnicmp(argv[argIdx], "locallatency", 13) == 0) { testType = LocalMemLatency; fprintf(stderr, "Testing local mem latency\n"); } else if (_strnicmp(argv[argIdx], "texlatency", 10) == 0) { testType = TexMemLatency; fprintf(stderr, "Testing texture mem latency\n"); } else if (_strnicmp(argv[argIdx], "localatomiccmpxchg", 18) == 0) { testType = LocalAtomicLatency; fprintf(stderr, "Testing local atomic latency (cmpxchg)\n"); } else if (_strnicmp(argv[argIdx], "localatomicadd", 14) == 0) { testType = LocalAtomicAdd; fprintf(stderr, "Testing local atomic add\n"); } else if (_strnicmp(argv[argIdx], "bw", 2) == 0) { testType = GlobalMemBandwidth; fprintf(stderr, "Testing global memory bandwidth\n"); // Somewhat reasonable defaults if (!thread_count_set) thread_count = 131072; if (!local_size_set) local_size = 256; if (!chase_iterations_set) chase_iterations = 500000; } else if (_strnicmp(argv[argIdx], "localbw", 7) == 0) { testType = LocalMemBandwidth; if (!thread_count_set) thread_count = 262144; if (!local_size_set) local_size = 256; fprintf(stderr, "Testing local memory bandwidth\n"); } else if (_strnicmp(argv[argIdx], "localchasebw", 12) == 0) { testType = LocalMemChaseBandwidth; fprintf(stderr, "Testing local memory bandwidth using pointer chasing and lots of waves\n"); } else if (_strnicmp(argv[argIdx], "local64bw", 9) == 0) { testType = LocalMem64Bandwidth; fprintf(stderr, "Testing local memory bandwidth using 64-bit loads\n"); } else if 
(_strnicmp(argv[argIdx], "localfloat4bw", 13) == 0) { testType = LocalMemFloat4Bandwidth; fprintf(stderr, "Testing local memory bandwidth using float4 (4x32-bit) loads\n"); } else if (_strnicmp(argv[argIdx], "mixedbw", 7) == 0) { testType = MixedFloat4Bandwidth; fprintf(stderr, "Mixed local/global load bw test with float4\n"); } else if (_strnicmp(argv[argIdx], "bufferbw", 8) == 0) { testType = BufferBandwidth; fprintf(stderr, "Testing buffer bandwidth\n"); } else if (_strnicmp(argv[argIdx], "ldstbw", 6) == 0) { testType = LoadStoreBandwidth; fprintf(stderr, "Testing load/store bandwidth\n"); } else if (_strnicmp(argv[argIdx], "scaling", 7) == 0) { testType = MemBandwidthWorkgroupScaling; fprintf(stderr, "Testing BW scaling with workgroups\n"); if (!chase_iterations_set) chase_iterations = 20000000; if (argIdx + 1 < argc && argv[argIdx + 1][0] != '-') { argIdx++; forceCuCount = atoi(argv[argIdx]); fprintf(stderr, "Using up to %d workgroups\n", forceCuCount); } } else if (_strnicmp(argv[argIdx], "c2c", 3) == 0) { testType = CoreToCore; fprintf(stderr, "Testing latency with global atomics across CU count\n"); } else if (_strnicmp(argv[argIdx], "link", 4) == 0) { testType = LinkBandwidth; fprintf(stderr, "Testing host <-> GPU link bandwidth\n"); if (!chase_iterations_set) chase_iterations = 30000000; } else if (_strnicmp(argv[argIdx], "instructionrate", 15) == 0) { testType = InstructionRate; fprintf(stderr, "Testing instruction rate\n"); if (!chase_iterations_set) chase_iterations = 1000; if (!local_size_set && !thread_count_set) { local_size = 256; thread_count = 32768; fprintf(stderr, "Selecting local size = %d, threads = %d\n", local_size, thread_count); } } else if (_strnicmp(argv[argIdx], "tmu", 3) == 0) { testType = TextureThroughput; fprintf(stderr, "Testing TMUs\n"); } else if (_strnicmp(argv[argIdx], "divergence", 10) == 0) { testType = Divergence; fprintf(stderr, "Testing compute throughput with varying numbers of consecutive threads doing the same op\n"); 
if (!local_size_set && !thread_count_set) { local_size = 256; thread_count = 32768; fprintf(stderr, "Selecting local size = %d, threads = %d\n", local_size, thread_count); } } else if (_strnicmp(argv[argIdx], "partition", 9) == 0) { testType = Partition; fprintf(stderr, "Testing execution unit partitioning. Make sure wave size is set!\n"); } else { fprintf(stderr, "I'm so confused. Unknown test type %s\n", argv[argIdx]); } } } } if (argc == 1) { fprintf(stderr, "Usage:\n\t[-test ]\n\t[-platform ]\n\t[-device ]\n"); fprintf(stderr, "\t[-threads ]\n\t[-localsize ]\n\t[-bwskip ]\n"); fprintf(stderr, "Number of threads (OpenCL global work size) must be divisible by local work size\n"); } fprintf(stderr, "Using %d threads with local size %d\n", thread_count, local_size); #pragma region opencl_overhead // Create an OpenCL context cl_context context = get_context_from_user(platform_index, device_index); if (context == NULL) exit(1); // Load kernel cl_program program = build_program(context, "kernel.cl", NULL); if (saveprogram) write_program(program, "kernel"); // Create a command queue cl_command_queue command_queue = clCreateCommandQueue(context, selected_device_id, 0, &ret); fprintf(stderr, "clCreateCommandQueue returned %d\n", ret); cl_kernel c2c_atomic_latency_test_kernel = clCreateKernel(program, "c2c_atomic_exec_latency_test", &ret); cl_kernel dummy_add_kernel = clCreateKernel(program, "dummy_add", &ret); cl_kernel local_bw_chase_kernel = clCreateKernel(program, "local_chase_kernel", &ret); #pragma endregion opencl_overhead max_global_test_size = get_max_buffer_size(); if (testType == GlobalAtomicLatency) { cl_program prog = build_program(context, "atomic_exec_latency_test.cl", NULL); cl_kernel atomic_latency_test_kernel = clCreateKernel(prog, "atomic_exec_latency_test", &ret); if (saveprogram) write_program(prog, "atomic_exec_latency_test"); chase_iterations = 200000; uint32_t elapsed_ms = 0, target_ms = 2000; while (elapsed_ms < target_ms / 2) { result = 
int_atomic_latency_test(context, command_queue, atomic_latency_test_kernel, chase_iterations, false, &elapsed_ms); fprintf(stderr, "%d iterations, %u ms => %f ns\n", chase_iterations, elapsed_ms, result); chase_iterations = scale_iterations_to_target(chase_iterations, elapsed_ms, target_ms); } printf("global atomic latency: %f\n", result); clReleaseKernel(atomic_latency_test_kernel); clReleaseProgram(prog); } else if (testType == LocalAtomicLatency) { cl_program prog = build_program(context, "local_atomic_latency_test.cl", NULL); cl_kernel local_atomic_latency_test_kernel = clCreateKernel(prog, "local_atomic_latency_test", &ret); if (saveprogram) write_program(prog, "local_atomic_latency_test"); chase_iterations = 500000; uint32_t elapsed_ms = 0, target_ms = 2000; while (elapsed_ms < target_ms / 2) { result = int_atomic_latency_test(context, command_queue, local_atomic_latency_test_kernel, chase_iterations, true, &elapsed_ms); fprintf(stderr, "%d iterations, %u ms => %f ns\n", chase_iterations, elapsed_ms, result); chase_iterations = scale_iterations_to_target(chase_iterations, (float)elapsed_ms, (float)target_ms); } printf("local atomic latency: %f\n", result); clReleaseKernel(local_atomic_latency_test_kernel); clReleaseProgram(prog); } else if (testType == GlobalAtomicAdd) { cl_program prog = build_program(context, "atomic_exec_latency_test.cl", NULL); cl_kernel global_atomic_add_kernel = clCreateKernel(prog, "atomic_add_test", &ret); if (saveprogram) write_program(prog, "atomic_exec_latency_test"); result = int_atomic_add_test(context, command_queue, global_atomic_add_kernel, thread_count, local_size); fprintf(stderr, "Global atomic INT32 adds: %f GOPS\n", result); } else if (testType == LocalAtomicAdd) { cl_program prog = build_program(context, "local_atomic_latency_test.cl", NULL); cl_kernel local_atomic_add_kernel = clCreateKernel(prog, "local_atomic_add_test", &ret); if (saveprogram) write_program(prog, "local_atomic_latency_test"); result = 
int_atomic_add_test(context, command_queue, local_atomic_add_kernel, thread_count, local_size); fprintf(stderr, "Local atomic INT32 adds: %f GOPS\n", result); } else if (testType == VectorMemLatency || testType == ScalarMemLatency) { cl_program prog; cl_kernel globalMemLatencyKernel; if (testType == ScalarMemLatency) { prog = build_program(context, "scalar_unrolled_latency_test.cl", NULL); globalMemLatencyKernel = clCreateKernel(prog, "scalar_unrolled_latency_test", &ret); if (saveprogram) write_program(prog, "scalar_unrolled_latency_test"); } else // Vector mem latency { prog = build_program(context, "unrolled_latency_test.cl", NULL); globalMemLatencyKernel = clCreateKernel(prog, "unrolled_latency_test", &ret); if (saveprogram) write_program(prog, "unrolled_latency_test"); } fprintf(stderr, "Doing %d K p-chase iterations with stride %d over %d KiB region\n", chase_iterations / 1000, stride, list_size * 4 / 1024); printf("\nSattolo, global memory latency (up to %llu K) unroll:\n", max_global_test_size / 1024); for (int size_idx = 0; size_idx < sizeof(default_test_sizes) / sizeof(int); size_idx++) { if (max_global_test_size < sizeof(int) * 256 * default_test_sizes[size_idx]) { printf("%d K would exceed device's max buffer size of %llu K, stopping here.\n", default_test_sizes[size_idx], max_global_test_size / 1024); break; } result = latency_test(context, command_queue, globalMemLatencyKernel, 256 * default_test_sizes[size_idx], scale_iterations(default_test_sizes[size_idx], chase_iterations), false, thread_count, local_size, wave, stride, NULL); printf("%d,%f\n", default_test_sizes[size_idx], result); if (result == 0) { printf("Something went wrong, not testing anything bigger.\n"); break; } } clReleaseKernel(globalMemLatencyKernel); clReleaseProgram(prog); } else if (testType == MemDivergence) { cl_program vecProg, texProg; cl_kernel vecKernel, texKernel; fprintf(stderr, "Testing mem divergence with localsize %d, test size %d KB\n", local_size, sizeKb); // vector 
vecProg = build_program(context, "unrolled_latency_test.cl", NULL); if (saveprogram) write_program(vecProg, "vector_unrolled_latency_test"); vecKernel = clCreateKernel(vecProg, "unrolled_latency_test", &ret); texProg = build_program(context, "tex_latency_test.cl", NULL); texKernel = clCreateKernel(texProg, "tex_latency_test", &ret); if (saveprogram) write_program(texProg, "tex_latency_test"); float* memDivergenceResults = (float*)malloc(sizeof(float) * local_size * 2); for (int threadCount = 1; threadCount <= local_size; threadCount++) { float vecResult = latency_test(context, command_queue, vecKernel, 256 * sizeKb, scale_iterations(sizeKb, chase_iterations), false, threadCount, threadCount, 1, stride, NULL); memDivergenceResults[threadCount * 2] = vecResult; float texResult = tex_latency_test(context, command_queue, texKernel, 256 * sizeKb, scale_iterations(sizeKb, chase_iterations), threadCount, threadCount, 1); memDivergenceResults[threadCount * 2 + 1] = texResult; fprintf(stderr, "%d threads: %f vec, %f tex\n", threadCount, vecResult, texResult); } for (int threadCount = 1; threadCount <= local_size; threadCount++) { printf("%d,%f,%f\n", threadCount, memDivergenceResults[threadCount * 2], memDivergenceResults[threadCount * 2 + 1]); } clReleaseKernel(texKernel); clReleaseKernel(vecKernel); clReleaseProgram(texProg); clReleaseProgram(vecProg); free(memDivergenceResults); } else if (testType == LocalMemCapacity) { char build_options[128]; const char* local_mem_define_prefix = "-D LATENCY_LOCAL_MEM_SIZE="; memset(build_options, 0, 128); memcpy(build_options, local_mem_define_prefix, 26); snprintf(build_options + 26, 128 - 26, "%u", 256 * local_mem_size_kb); cl_program program = build_program(context, "local_mem_latency_kernel.cl", build_options); cl_kernel local_mem_capacity_kernel = clCreateKernel(program, "unrolled_latency_test_localmem", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Could not create local mem capacity testing kernel\n"); exit(0); } if 
(saveprogram) write_program(program, "local_mem_latency_kernel"); fprintf(stderr, "Testing local memory capacity with %u KB of local mem per WG, up to %u WGs\n", local_mem_size_kb, group_count); printf("Groups,Local Mem Capacity,Latency\n"); for (int groups = 1; groups <= group_count; groups++) { result = latency_test(context, command_queue, local_mem_capacity_kernel, 256 * sizeKb, (uint32_t)scale_iterations(sizeKb, chase_iterations), true, groups, 1, 1, 64, NULL); printf("%d,%d,%f\n", groups, groups* local_mem_size_kb, result); } clReleaseKernel(local_mem_capacity_kernel); clReleaseProgram(program); } else if (testType == ConstantMemLatency) { cl_program prog = build_program(context, "constant_unrolled_latency_test.cl", NULL); cl_kernel constant_kernel = clCreateKernel(prog, "constant_unrolled_latency_test", &ret); if (saveprogram) write_program(prog, "constant_unrolled_latency_test"); cl_ulong max_constant_test_size = get_max_constant_buffer_size(); printf("\nSattolo, constant memory (up to %llu K), no-unroll:\n", max_constant_test_size / 1024); for (int size_idx = 0; size_idx < sizeof(default_test_sizes) / sizeof(int); size_idx++) { if (max_constant_test_size < sizeof(int) * 256 * default_test_sizes[size_idx]) { printf("%d K would exceed device's max constant buffer size of %llu K, stopping here.\n", default_test_sizes[size_idx], max_constant_test_size / 1024); break; } result = latency_test(context, command_queue, constant_kernel, 256 * default_test_sizes[size_idx], scale_iterations(default_test_sizes[size_idx], chase_iterations), false, thread_count, local_size, wave, stride, NULL); printf("%d,%f\n", default_test_sizes[size_idx], result); if (result == 0) { printf("Something went wrong, not testing anything bigger.\n"); break; } } clReleaseKernel(constant_kernel); clReleaseProgram(program); } else if (testType == TexMemLatency) { cl_program prog = build_program(context, "tex_latency_test.cl", NULL); cl_kernel tex_latency_kernel = clCreateKernel(prog, 
"tex_latency_test", &ret); if (saveprogram) write_program(prog, "tex_latency_test"); cl_ulong max_tex_test_size = get_max_tex_buffer_size(); for (int size_idx = 0; size_idx < sizeof(default_test_sizes) / sizeof(int); size_idx++) { if (default_test_sizes[size_idx] * 1024 > max_tex_test_size) { printf("%d K would exceed device's texture buffer size of %llu K, stopping here.\n", default_test_sizes[size_idx], max_tex_test_size / 1024); break; } result = tex_latency_test(context, command_queue, tex_latency_kernel, 256 * default_test_sizes[size_idx], scale_iterations(default_test_sizes[size_idx], chase_iterations), thread_count, local_size, wave); printf("%d,%f\n", default_test_sizes[size_idx], result); if (result == 0) { printf("Something went wrong, not testing anything bigger.\n"); break; } } clReleaseKernel(tex_latency_kernel); clReleaseProgram(prog); } else if (testType == LocalMemLatency) { cl_program prog = build_program(context, "local_unrolled_latency_test.cl", NULL); cl_kernel local_kernel = clCreateKernel(prog, "local_unrolled_latency_test", &ret); if (saveprogram) write_program(prog, "local_unrolled_latency_test"); uint32_t elapsed_ms = 0, target_ms = 2000; chase_iterations = 50000; while (elapsed_ms < target_ms / 2) { result = latency_test(context, command_queue, local_kernel, 1024, chase_iterations, false, thread_count, local_size, wave, stride, &elapsed_ms); fprintf(stderr, "%u iterations, %u ms -> %f ns\n", chase_iterations, elapsed_ms, result); chase_iterations = scale_iterations_to_target(chase_iterations, elapsed_ms, target_ms); } printf("Local mem latency: %f\n", result); clReleaseKernel(local_kernel); clReleaseProgram(prog); } else if (testType == GlobalMemBandwidth) { cl_program prog = build_program(context, "sum_bw_test.cl", NULL); cl_kernel bw_kernel = clCreateKernel(prog, "sum_bw_test", &ret); if (saveprogram) write_program(prog, "sum_bw_test"); fprintf(stderr, "Using %u threads, %u local size, %u base iterations\n", thread_count, local_size, 
chase_iterations); printf("\nMemory bandwidth (up to %llu K):\n", max_global_test_size / 1024); if (!sizeKb) { for (int size_idx = 0; size_idx < sizeof(default_bw_test_sizes) / sizeof(unsigned long long); size_idx++) { uint64_t testSizeKb = default_bw_test_sizes[size_idx] / 1024; if ((max_global_test_size / 1024) < testSizeKb) { printf("%llu K would exceed device's max buffer size of %llu K, stopping here.\n", testSizeKb, max_global_test_size / 1024); break; } result = bw_test(context, command_queue, bw_kernel, 256 * testSizeKb, thread_count, local_size, skip, scale_bw_iterations(chase_iterations, testSizeKb)); printf("%llu,%f\n", testSizeKb, result); if (result == 0) { printf("Something went wrong, not testing anything bigger.\n"); break; } } } else { result = bw_test(context, command_queue, bw_kernel, 256 * sizeKb, thread_count, local_size, skip, scale_bw_iterations(chase_iterations, sizeKb)); printf("%lu,%f\n", sizeKb, result); if (result == 0) { printf("Something went wrong, not testing anything bigger.\n"); } } clReleaseKernel(bw_kernel); clReleaseProgram(prog); } else if (testType == LocalMemBandwidth || testType == LocalMem64Bandwidth || testType == BufferBandwidth || testType == LoadStoreBandwidth || testType == TextureThroughput || testType == LocalMemFloat4Bandwidth || testType == MixedFloat4Bandwidth) { cl_program prog; cl_kernel local_bw_kernel = NULL, local_64_bw_kernel = NULL, local_float4_bw_kernel = NULL, buffer_bw_kernel = NULL, tex_bw_kernel = NULL, loadstore_bw_kernel = NULL; cl_kernel mixed_bw_kernel = NULL; if (testType == LocalMemBandwidth) { prog = build_program(context, "local_bw_test.cl", NULL); local_bw_kernel = clCreateKernel(prog, "local_bw_test", &ret); if (saveprogram) write_program(prog, "local_bw_test"); } else if (testType == LocalMem64Bandwidth) { prog = build_program(context, "local_64_bw_test.cl", NULL); local_64_bw_kernel = clCreateKernel(prog, "local_64_bw_test", &ret); if (saveprogram) write_program(prog, "local_64_bw_test"); 
} else if (testType == LocalMemFloat4Bandwidth) { prog = build_program(context, "local_float4_bw_test.cl", NULL); local_float4_bw_kernel = clCreateKernel(prog, "local_float4_bw_test", &ret); if (saveprogram) write_program(prog, "local_float4_bw_test"); } else if (testType == BufferBandwidth) { prog = build_program(context, "buffer_bw_test.cl", NULL); buffer_bw_kernel = clCreateKernel(prog, "buffer_bw_test", &ret); if (saveprogram) write_program(prog, "buffer_bw_test"); } else if (testType == LoadStoreBandwidth) { prog = build_program(context, "ldst_bw_test.cl", NULL); loadstore_bw_kernel = clCreateKernel(prog, "ldst_bw_test", &ret); if (saveprogram) write_program(prog, "ldst_bw_test"); } else if (testType == MixedFloat4Bandwidth) { prog = build_program(context, "local_float4_bw_test.cl", NULL); mixed_bw_kernel = clCreateKernel(prog, "mixed_float4_bw_test", NULL); if (saveprogram) write_program(prog, "mixed_float4_bw_test"); } else { // tex throughput prog = build_program(context, "tex_bw_test.cl", NULL); tex_bw_kernel = clCreateKernel(prog, "tex_bw_test", &ret); if (saveprogram) write_program(prog, "tex_bw_test"); } uint32_t thread_low = 1024, thread_high = 1048576*4; if (!thread_count_set) thread_count = thread_low; float max_bw = 0; while (true) { int64_t elapsed_ms = 0, target_ms = 1500; if (!chase_iterations_set) chase_iterations = 500000; while (elapsed_ms < target_ms / 2) { if (testType == LocalMemBandwidth) { fprintf(stderr, "Testing local mem bw\n"); result = local_bw_test(context, command_queue, local_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms); } else if (testType == LocalMem64Bandwidth) { fprintf(stderr, "Testing local mem bw with 64-bit loads\n"); result = local_64_bw_test(context, command_queue, local_64_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms); } else if (testType == LocalMemFloat4Bandwidth) { fprintf(stderr, "Testing local mem bw with float4 loads\n"); result = local_bw_test(context, command_queue, 
local_float4_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms); } else if (testType == MixedFloat4Bandwidth) { fprintf(stderr, "Testing mixed local/global bw with float4 loads\n"); result = local_bw_test(context, command_queue, mixed_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms); } else if (testType == BufferBandwidth) { fprintf(stderr, "Testing buffer bw\n"); result = buffer_bw_test(context, command_queue, buffer_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms); } else if (testType == LoadStoreBandwidth) { fprintf(stderr, "Testing global load bandwidth\n"); result = local_bw_test(context, command_queue, loadstore_bw_kernel, thread_count, local_size, chase_iterations, &elapsed_ms); } else if (testType == TextureThroughput) { fprintf(stderr, "Testing texture throughput\n"); result = tex_bw_test(context, command_queue, tex_bw_kernel, 256, // width 256, // height thread_count, local_size, 0, chase_iterations, &elapsed_ms); } fprintf(stderr, "%u threads, %u local size, %u iterations ==> %f GB/s, elapsed time %lld ms\n", thread_count, local_size, chase_iterations, result, elapsed_ms); if (elapsed_ms < 25) chase_iterations *= 2; else chase_iterations = (uint32_t)((float)chase_iterations * (target_ms / elapsed_ms)); if (result == 0) { fprintf(stderr, "Run failed\n"); break; } if (chase_iterations_set) break; } if (result > max_bw) max_bw = result; if (thread_count_set) break; thread_count *= 2; if (thread_count > thread_high) break; } printf("Bandwidth: %f GB/s\n", max_bw); } else if (testType == LocalMemChaseBandwidth) { int thread_scan_done = 0; uint32_t thread_low = 256, thread_high = 524288 * 4; fprintf(stderr, "Testing local memory bandwidth using pointer chasing. 
Ensure wave size is set correctly with -wave\n"); if (!thread_count_set) thread_count = thread_low; while (!thread_scan_done) { // ignore chase iterations and auto manage it int64_t elapsed_ms = 0, target_ms = 1500; chase_iterations = 500000; if (thread_count_set) thread_scan_done = 0; else { thread_count *= 2; if (thread_count > thread_high) break; } while (elapsed_ms < target_ms / 2) { result = local_chase_bw_test(context, command_queue, local_bw_chase_kernel, thread_count, local_size, chase_iterations, wave, &elapsed_ms); fprintf(stderr, "%u threads, %u local size, %u wave, %u iterations ==> %f GB/s, elapsed time %lld ms\n", thread_count, local_size, wave, chase_iterations, result, elapsed_ms); if (elapsed_ms < 25) chase_iterations *= 2; else chase_iterations = (uint32_t)((float)chase_iterations * (target_ms / elapsed_ms)); if (result == 0) { fprintf(stderr, "Run failed\n"); break; } } } printf("Local memory bandwidth: %f GB/s\n", result); } else if (testType == MemBandwidthWorkgroupScaling) { cl_program prog = build_program(context, "sum_bw_test.cl", NULL); cl_kernel bw_kernel = clCreateKernel(prog, "sum_bw_test", &ret); if (saveprogram) write_program(prog, "sum_bw_test"); uint32_t testSizeCount = sizeof(default_bw_test_sizes) / sizeof(unsigned long long); cl_uint cuCount = forceCuCount ? 
forceCuCount : getCuCount(); fprintf(stderr, "Device has %u compute units\n", cuCount); float* scalingResults = (float*)malloc(sizeof(float) * cuCount * testSizeCount); for (uint32_t workgroupCount = 1; workgroupCount <= cuCount; workgroupCount++) { if (!sizeKb) { for (int size_idx = 0; size_idx < testSizeCount; size_idx++) { uint64_t testSizeKb = default_bw_test_sizes[size_idx] / 1024; fprintf(stderr, "Testing size %llu KB, %u workgroups\n", testSizeKb, workgroupCount); if ((max_global_test_size / 1024) < testSizeKb) { printf("%llu K would exceed device's max buffer size of %llu K\n", testSizeKb, max_global_test_size / 1024); scalingResults[(workgroupCount - 1) * testSizeCount + size_idx] = 0; continue; } result = bw_test(context, command_queue, bw_kernel, 256 * testSizeKb, local_size * workgroupCount, local_size, skip, scale_bw_iterations(chase_iterations, testSizeKb)); scalingResults[(workgroupCount - 1) * testSizeCount + size_idx] = result; fprintf(stderr, "%u workgroups, %llu KB = %f GB/s\n", workgroupCount, testSizeKb, result); } } else { fprintf(stderr, "Testing size %d KB, %u workgroups\n", sizeKb, workgroupCount); result = bw_test(context, command_queue, bw_kernel, 256 * sizeKb, local_size * workgroupCount, local_size, skip, scale_bw_iterations(chase_iterations, sizeKb)); scalingResults[workgroupCount - 1] = result; fprintf(stderr, "%u workgroups, %lu KB = %f GB/s\n", workgroupCount, sizeKb, result); } } if (!sizeKb) { for (uint32_t workgroupCount = 1; workgroupCount <= cuCount; workgroupCount++) { printf(",%u", workgroupCount); } printf("\n"); for (int size_idx = 0; size_idx < testSizeCount; size_idx++) { printf("%llu", default_bw_test_sizes[size_idx] / 1024); for (uint32_t workgroupCount = 1; workgroupCount <= cuCount; workgroupCount++) { printf(",%f", scalingResults[(workgroupCount - 1) * testSizeCount + size_idx]); } printf("\n"); } } else { printf("For %d KB:\n", sizeKb); for (int workgroupIdx = 0; workgroupIdx < cuCount; workgroupIdx++) { 
printf("%d,%f\n", workgroupIdx + 1, scalingResults[workgroupIdx]); } printf("\n"); } free(scalingResults); clReleaseKernel(bw_kernel); clReleaseProgram(prog); } else if (testType == CoreToCore) { c2c_atomic_latency_test(context, command_queue, c2c_atomic_latency_test_kernel, chase_iterations); } else if (testType == LinkBandwidth) { link_bw_test(context, command_queue, dummy_add_kernel, chase_iterations); } else if (testType == InstructionRate) { instruction_rate_test(context, command_queue, thread_count, local_size, chase_iterations, forcefp16, forcefp64); } else if (testType == Divergence) { int current_wave = 1; int max_wave = 512; printf("Contiguous Thread Block Size,FP32 GOPs\n"); while (current_wave <= max_wave) { float gops = run_divergence_rate_test(context, command_queue, thread_count, local_size, current_wave, NULL); printf("%d,%f\n", current_wave, gops); current_wave *= 2; } } else if (testType == Partition) { // function and its associated kernel serve two purposes int pattern4[] = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 }; float result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, pattern4); printf("Throughput: %f\n", result); int patterns[] = { 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0 }; result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, patterns); printf("Throughput: %f\n", result); int pattern2[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, pattern2); printf("Throughput: %f\n", result); int consec_pattern[] = { 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0 }; result = run_divergence_rate_test(context, command_queue, thread_count, local_size, wave, consec_pattern); printf("Throughput: %f\n", result); } //printf("If you didn't run this through cmd, now you can copy the results. 
And press ctrl+c to close"); //scanf("\n"); // Clean up cleanup: ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseProgram(program); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); return 0; } /// /// Heuristic to make sure test runs for enough time but not too long /// /// Region size /// base iterations /// scaled iterations uint64_t scale_iterations(uint32_t size_kb, uint64_t iterations) { return 10 * iterations / pow(size_kb, 1.0 / 4.0); } #define INT_EXEC_INPUT_SIZE 16 float int_exec_latency_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t iterations) { cl_int ret; cl_int result = 0; size_t global_item_size = 1; size_t local_item_size = 1; float latency; uint32_t time_diff_ms; uint32_t A[INT_EXEC_INPUT_SIZE]; for (int i = 0; i < INT_EXEC_INPUT_SIZE; i++) A[i] = i; cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, INT_EXEC_INPUT_SIZE * sizeof(uint32_t), NULL, &ret); cl_mem result_obj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, &result); ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, INT_EXEC_INPUT_SIZE * sizeof(uint32_t), A, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, result_obj, CL_TRUE, 0, sizeof(cl_int), &result, 0, NULL, NULL); clFinish(command_queue); clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&a_mem_obj); clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&iterations); clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&result_obj); start_timing(); ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { fprintf(stderr, "Failed to submit kernel to command queue. 
clEnqueueNDRangeKernel returned %d\n", ret); latency = 0; goto cleanup; } clFinish(command_queue); time_diff_ms = end_timing(); latency = 1e6 * (float)time_diff_ms / (float)(iterations * 12); cleanup: clFlush(command_queue); clFinish(command_queue); clReleaseMemObject(a_mem_obj); clReleaseMemObject(result_obj); return latency; } uint32_t scale_bw_iterations(uint32_t base_iterations, uint32_t size_kb) { if (size_kb < 4096) return base_iterations; else return base_iterations / 2; } ================================================ FILE: GpuMemLatency/opencltest.h ================================================ #pragma once #ifndef opencltestheader #define opencltestheader #include #include #include #include #include #include "../Common/timing.h" #define false 0 #define true 1 #define CL_USE_DEPRECATED_OPENCL_1_2_APIS #ifndef __APPLE__ #include #else #include #endif #define MAX_SOURCE_SIZE (0x100000) #define CACHELINE_SIZE 64 #define TARGET_TIME_MS 2000 #ifndef _MSC_VER #define _strnicmp strncmp #endif extern cl_device_id selected_device_id; extern cl_platform_id selected_platform_id; extern cl_ulong max_global_test_size; extern int saveprogram; cl_context get_context_from_user(int platform_index, int device_index); cl_program build_program(cl_context context, const char* fname, const char *params); void write_program(cl_program program, const char *name); uint32_t adjust_iterations(uint32_t iterations, uint64_t time_ms); void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment); cl_uint getCuCount(); size_t getMaxWorkgroupSize(); cl_ulong get_max_constant_buffer_size(); cl_ulong get_max_buffer_size(); cl_ulong get_max_tex_buffer_size(); cl_ulong get_max_2d_tex_width(); cl_ulong get_max_2d_tex_height(); float int_atomic_latency_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t iterations, short local, uint32_t *time_ms); float int_atomic_add_test(cl_context context, cl_command_queue command_queue, 
cl_kernel kernel, size_t threads, size_t localsize); float latency_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t list_size, uint32_t chase_iterations, short uniform, int threads, int local_size, int wave, int stride, uint32_t *elapsed_ms); float tex_latency_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t list_size, uint32_t chase_iterations, int threads, int local_size, int wave_size); float bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint64_t list_size, uint32_t thread_count, uint32_t local_size, uint32_t skip, uint32_t chase_iterations); float tex_bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint64_t width, uint64_t height, uint32_t thread_count, uint32_t local_size, uint32_t randomize, uint32_t chase_iterations, int64_t *time_ms); float local_bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, int64_t *time_ms); float local_chase_bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, uint32_t wave_size, int64_t* time_ms); float local_64_bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, int64_t* time_ms); float buffer_bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t thread_count, uint32_t local_size, uint32_t chase_iterations, int64_t* time_ms); void link_bw_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t iterations); float c2c_atomic_latency_test(cl_context context, cl_command_queue command_queue, cl_kernel kernel, uint32_t iterations); float instruction_rate_test(cl_context context, cl_command_queue command_queue, uint32_t thread_count, uint32_t local_size, uint32_t 
chase_iterations, int forcefp16, int forcefp64); float run_divergence_rate_test(cl_context context, cl_command_queue command_queue, uint32_t thread_count, uint32_t local_size, uint32_t wave, int *pattern); #endif ================================================ FILE: GpuMemLatency/opencltest.sln ================================================  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 16 VisualStudioVersion = 16.0.30503.244 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "opencltest", "opencltest.vcxproj", "{FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 Debug|x86 = Debug|x86 Release|x64 = Release|x64 Release|x86 = Release|x86 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x64.ActiveCfg = Debug|x64 {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x64.Build.0 = Debug|x64 {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x86.ActiveCfg = Debug|Win32 {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Debug|x86.Build.0 = Debug|Win32 {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x64.ActiveCfg = Release|x64 {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x64.Build.0 = Release|x64 {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x86.ActiveCfg = Release|Win32 {FA51D7F4-F6E0-4CB5-9CDD-AD39A3519F78}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {4447E91D-E7A1-4249-87A7-E75A78167E71} EndGlobalSection EndGlobal ================================================ FILE: GpuMemLatency/opencltest.vcxproj ================================================ Debug Win32 Release Win32 Debug x64 Release x64 16.0 Win32Proj {fa51d7f4-f6e0-4cb5-9cdd-ad39a3519f78} opencltest 10.0 Application true v143 
# Build rules for the InstructionRate microbenchmark.
# ../Common/arch_detect.mk is included first; the default goal builds $(TARGET).
# NOTE(review): TARGET is presumably set by arch_detect.mk to one of the
# per-architecture targets below - confirm against that file.
include ../Common/arch_detect.mk

CFLAGS = -O3

all: $(TARGET)

# x86-64
amd64:
	$(CC) $(CFLAGS) x86_instructionrate.s x86_instructionrate.c -o InstructionRate_amd64 $(LDFLAGS)

# 64-bit ARM, tuned for the build host
aarch64:
	$(CC) $(CFLAGS) -march=native -pthread arm_instructionrate.s arm_instructionrate.c -o InstructionRate_aarch64 $(LDFLAGS)

# 64-bit RISC-V (rv64gc)
riscv64:
	$(CC) $(CFLAGS) -march=rv64gc -pthread riscv_instructionrate.s riscv_instructionrate.c -o InstructionRate_riscv64 $(LDFLAGS)

# 64-bit ARM under Termux: uses clang directly and enables the AES extension
termux:
	clang -march=armv8+aes arm_instructionrate.s arm_instructionrate.c -o InstructionRate_aarch64 $(LDFLAGS)

# x86-64 fusion tests
amd64_fusion:
	$(CC) $(CFLAGS) x86_fusion.s x86_fusion.c -o InstructionRateFusion_amd64 $(LDFLAGS)

# Windows x64 executable
w64:
	$(CC) $(CFLAGS) x86_instructionrate.c x86_instructionrate.s -o InstructionRate_w64.exe $(LDFLAGS)

ci: amd64 amd64_fusion aarch64 riscv64 w64

# Removes object files and every file with the executable bit set under this
# directory.
clean:
	rm -f *.o && find . -type f -executable -delete

# None of these targets produce a file with the target's own name, so all of
# them are declared phony; otherwise a stray file called e.g. "amd64" would
# make the rule appear up to date and skip the build.
.PHONY: all amd64 aarch64 riscv64 termux amd64_fusion w64 ci clean
vecfadd128test(uint64_t iterations, float arr[4]); extern uint64_t vecfmul128test(uint64_t iterations, float arr[4]); extern uint64_t latvecfadd128test(uint64_t iterations, float arr[4]); extern uint64_t latvecfmul128test(uint64_t iterations, float arr[4]); extern uint64_t mixvecfaddfmul128test(uint64_t iterations, float arr[4]); extern uint64_t vecfma128test(uint64_t iterations, float arr[4]); extern uint64_t scalarfmatest(uint64_t iterations, float arr[4]); extern uint64_t latvecfma128test(uint64_t iterations, float arr[4]); extern uint64_t latscalarfmatest(uint64_t iterations, float arr[4]); extern uint64_t mixvecfaddfma128test(uint64_t iterations, float arr[4]); extern uint64_t mixvecfmulfma128test(uint64_t iterations, float arr[4]); // see if SIMD pipeline shares ports with scalar ALU ones extern uint64_t mixaddvecadd128test(uint64_t iterations, int arr[4]); extern uint64_t mix3to1addvecadd128test(uint64_t iterations, int arr[4]); extern uint64_t mix1to1addvecadd128test(uint64_t iterations, int arr[4]); extern uint64_t mixmulvecmultest(uint64_t iterations, int arr[4]); // are vec int and vec fp on the same port? 
extern uint64_t mixvecmulfmultest(uint64_t iterations, float farr[4], int iarr[4]); extern uint64_t mixvecaddfaddtest(uint64_t iterations, float farr[4], int iarr[4]); // where are the branch ports extern uint64_t mixjmpvecaddtest(uint64_t iterations, int arr[4]); extern uint64_t mixjmpvecmultest(uint64_t iterations, int arr[4]); // load/store extern uint64_t loadtest(uint64_t iterations, int arr[4]); extern uint64_t mixloadstoretest(uint64_t iterations, int arr[4], int sink[4]); extern uint64_t mix21loadstoretest(uint64_t iterations, int arr[4], int sink[4]); extern uint64_t vecloadtest(uint64_t iterations, int arr[4]); extern uint64_t vecstoretest(uint64_t iterations, int arr[4], int sink[4]); // renamer tests extern uint64_t indepmovtest(uint64_t iterations); extern uint64_t depmovtest(uint64_t iterations); extern uint64_t xorzerotest(uint64_t iterations); extern uint64_t movzerotest(uint64_t iterations); extern uint64_t subzerotest(uint64_t iterations); // Is crypto separate extern uint64_t aesetest(uint64_t iterations, int arr[4]); extern uint64_t mixaesevecadd128test(uint64_t iterations, int arr[4]); extern uint64_t pmulltest(uint64_t iterations, int arr[4]); extern uint64_t mixpmulladd128test(uint64_t iterations, int arr[4]); float fpTestArr[4] __attribute__ ((aligned (64))) = { 0.2, 1.5, 2.7, 3.14 }; int intTestArr[4] __attribute__ ((aligned (64))) = { 1, 2, 3, 4 }; int sinkArr[4] __attribute__ ((aligned (64))) = { 2, 3, 4, 5 }; float measureFunction(uint64_t iterations, float clockSpeedGhz, uint64_t (*testfunc)(uint64_t)); uint64_t vecadd128wrapper(uint64_t iterations); uint64_t latvecadd128wrapper(uint64_t iterations); uint64_t vecmul128wrapper(uint64_t iterations); uint64_t latvecmul128wrapper(uint64_t iterations); uint64_t mixvecaddmul128wrapper(uint64_t iterations); uint64_t faddwrapper(uint64_t iterations); uint64_t latfaddwrapper(uint64_t iterations); uint64_t vecfadd128wrapper(uint64_t iterations); uint64_t latvecfadd128wrapper(uint64_t iterations); 
uint64_t vecfmul128wrapper(uint64_t iterations); uint64_t latvecfmul128wrapper(uint64_t iterations); uint64_t mixvecfaddfmul128wrapper(uint64_t iterations); uint64_t mixaddvecadd128wrapper(uint64_t iterations); uint64_t mix3to1addvecadd128wrapper(uint64_t iterations); uint64_t mix1to1addvecadd128wrapper(uint64_t iterations); uint64_t mixmulvecmulwrapper(uint64_t iterations); uint64_t mixvecmulfmulwrapper(uint64_t iterations); uint64_t mixvecaddfaddwrapper(uint64_t iterations); uint64_t mixjmpvecaddwrapper(uint64_t iterations); uint64_t mixjmpvecmulwrapper(uint64_t iterations); uint64_t vecloadwrapper(uint64_t iterations); uint64_t loadwrapper(uint64_t iterations); uint64_t vecstorewrapper(uint64_t iterations); uint64_t mixloadstorewrapper(uint64_t iterations); uint64_t mix21loadstorewrapper(uint64_t iterations); uint64_t vecfma128wrapper(uint64_t iterations); uint64_t scalarfmawrapper(uint64_t iterations); uint64_t latscalarfmawrapper(uint64_t iterations); uint64_t mixvecfaddfma128wrapper(uint64_t iterations); uint64_t mixvecfmulfma128wrapper(uint64_t iterations); uint64_t latvecfma128wrapper(uint64_t iteration); uint64_t aesetestwrapper(uint64_t iterations); uint64_t mixaesevecadd128wrapper(uint64_t iterations); uint64_t pmullwrapper(uint64_t iterations); uint64_t mixpmulladd128wrapper(uint64_t iterations); int threads = 0, hardaffinity = 0; cpu_set_t cpuset; int main(int argc, char *argv[]) { struct timeval startTv, endTv; struct timezone startTz, endTz; uint64_t iterations = 1500000000; uint64_t iterationsHigh = iterations * 5; uint64_t time_diff_ms; float latency, opsPerNs, clockSpeedGhz; if (argc > 1) { for (int argIdx = 1; argIdx < argc; argIdx++) { if (*(argv[argIdx]) == '-') { char *arg = argv[argIdx] + 1; if (strncmp(arg, "affinity", 8) == 0) { argIdx++; int targetCpu = atoi(argv[argIdx]); CPU_ZERO(&cpuset); CPU_SET(targetCpu, &cpuset); sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset); fprintf(stderr, "Set affinity to %d\n", targetCpu); } else if 
(strncmp(arg, "hardaffinity", 12) == 0) { CPU_ZERO(&cpuset); CPU_SET(0, &cpuset); CPU_SET(1, &cpuset); sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset); fprintf(stderr, "Set affinity 2,3\n"); hardaffinity = 1; } else if (strncmp(arg, "threads", 7) == 0) { argIdx++; threads = atoi(argv[argIdx]); fprintf(stderr, "Multithreading mode, %d threads\n", threads); } else if (strncmp(arg, "iter", 4) == 0) { argIdx++; int iterMul = atoi(argv[argIdx]); iterations *= iterMul; iterationsHigh *= iterMul; fprintf(stderr, "Scaled iterations by %d\n", iterMul); } } } } // figure out clock speed gettimeofday(&startTv, &startTz); clktest(iterations); gettimeofday(&endTv, &endTz); time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); latency = 1e6 * (float)time_diff_ms / (float)iterations; // clk speed should be 1/latency, assuming we got one add per clk, roughly clockSpeedGhz = 1/latency; printf("Estimated clock speed> %.2f GHz\n", clockSpeedGhz); printf("Nops per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, noptest)); printf("Adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addtest)); printf("XORs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, eortest)); printf("CMPs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, cmptest)); printf("\n----Renamer Tests----\n"); printf("Indepdent movs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, indepmovtest)); printf("Dependent movs per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, depmovtest)); printf("eor -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, xorzerotest)); printf("mov -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, movzerotest)); printf("sub -> 0 per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, subzerotest)); printf("\n----ALU Pipe Layout Tests----\n"); printf("Not taken jmps per clk> %.2f\n", 
measureFunction(iterationsHigh, clockSpeedGhz, jmptest)); printf("Jump fusion test> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fusejmptest)); printf("1:1 mixed not taken jmps / muls per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest)); printf("1:2 mixed not taken jmps / muls per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmuljmptest21)); printf("1:1 mixed not taken jmps / adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmptest)); printf("1:2 mixed not taken jmps / adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddjmp21test)); printf("1:1 mixed add/mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmultest)); printf("2:1 mixed add/mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmul21test)); printf("ror per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, rortest)); printf("1:1 mixed mul/ror per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulrortest)); printf("1:3 madd:add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, maddaddtest)); printf("32-bit mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul32test)); printf("64-bit mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul32test)); printf("64-bit multiply latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latmul64test)); printf("\n----FP/ASIMD Crypto Tests----\n"); printf("aese per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, aesetestwrapper)); printf("1:1 aese and vec 128 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaesevecadd128wrapper)); printf("pmull per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, pmullwrapper)); printf("1:1 pmull and vec 128 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixpmulladd128wrapper)); printf ("\n----FP/ASIMD Tests----\n"); 
printf("scalar fp32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, faddwrapper)); printf("128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecadd128wrapper)); printf("128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecmul128wrapper)); printf("128-bit vec int32 mixed multiply and add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddmul128wrapper)); printf("128-bit vec fp32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfadd128wrapper)); printf("128-bit vec fp32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfmul128wrapper)); printf("128-bit vec fp32 mixed multiply and add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfmul128wrapper)); printf("2:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixaddvecadd128wrapper)); printf("3:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix3to1addvecadd128wrapper)); printf("1:1 mixed scalar adds and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix1to1addvecadd128wrapper)); printf("1:1 mixed scalar 32-bit multiply and 128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixmulvecmulwrapper)); printf("1:1 mixed 128-bit vec fp32 multiply and 128-bit vec int32 multiply per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecmulfmulwrapper)); printf("1:1 mixed 128-bit vec fp32 add and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecaddfaddwrapper)); printf("1:2 mixed not taken jumps and 128-bit vec int32 add per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecaddwrapper)); printf("1:1 mixed not taken jumps and 128-bit vec int32 mul per clk> 
%.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixjmpvecmulwrapper)); printf("128-bit vec int32 add latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecadd128wrapper)); printf("128-bit vec int32 mul latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecmul128wrapper)); printf("Scalar FADD Latency> %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latfaddwrapper)); printf("128-bit vector FADD latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfadd128wrapper)); printf("128-bit vector FMUL latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfmul128wrapper)); printf("128-bit vector FMA per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecfma128wrapper)); printf("128-bit vector FMA latency> %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latvecfma128wrapper)); printf("Scalar FMA per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, scalarfmawrapper)); printf("Scalar FMA latency> %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latscalarfmawrapper)); printf("1:1 mixed 128-bit vector FMA/FADD per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfaddfma128wrapper)); printf("1:1 mixed 128-bit vector FMA/FMUL per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixvecfmulfma128wrapper)); printf("\n----Load/Store Tests----\n"); printf("128-bit vec loads per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecloadwrapper)); printf("128-bit vec stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecstorewrapper)); printf("64-bit loads per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, loadwrapper)); printf("1:1 mixed 64-bit loads/stores per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixloadstorewrapper)); printf("2:1 mixed 64-bit loads/stores per clk> %.2f\n", measureFunction(iterationsHigh, 
clockSpeedGhz, mix21loadstorewrapper)); return 0; } struct TestThreadData { uint64_t iterations; uint64_t (*testfunc)(uint64_t); }; void *TestThread(void *param) { struct TestThreadData *testData = (struct TestThreadData *)param; if (hardaffinity) { sched_setaffinity(gettid(), sizeof(cpu_set_t), &cpuset); } testData->testfunc(testData->iterations); return NULL; } float measureFunction(uint64_t iterations, float clockSpeedGhz, uint64_t (*testfunc)(uint64_t)) { struct timeval startTv, endTv; struct timezone startTz, endTz; uint64_t time_diff_ms; float latency, opsPerNs; gettimeofday(&startTv, &startTz); if (threads == 0) testfunc(iterations); else { pthread_t *testThreads = (pthread_t *)malloc(threads * sizeof(pthread_t)); struct TestThreadData *testData = (struct TestThreadData *)malloc(threads * sizeof(struct TestThreadData)); for (int threadIdx = 0; threadIdx < threads; threadIdx++) { testData[threadIdx].iterations = iterations; testData[threadIdx].testfunc = testfunc; pthread_create(testThreads + threadIdx, NULL, TestThread, testData + threadIdx); } for (int threadIdx = 0; threadIdx < threads; threadIdx++) { pthread_join(testThreads[threadIdx], NULL); } free(testThreads); free(testData); } gettimeofday(&endTv, &endTz); time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); latency = 1e6 * (float)time_diff_ms / (float)iterations; opsPerNs = 1/latency; //printf("%f adds/ns, %f adds/clk?\n", opsPerNs, opsPerNs / clockSpeedGhz); return opsPerNs / clockSpeedGhz; } uint64_t vecadd128wrapper(uint64_t iterations) { return vecadd128test(iterations, intTestArr); } uint64_t vecmul128wrapper(uint64_t iterations) { return vecmul128test(iterations, intTestArr); } uint64_t latvecadd128wrapper(uint64_t iterations) { return latvecadd128test(iterations, intTestArr); } uint64_t latvecmul128wrapper(uint64_t iterations) { return latvecmul128test(iterations, intTestArr); } uint64_t mixvecaddmul128wrapper(uint64_t iterations) { return 
mixvecaddmul128test(iterations, intTestArr); } uint64_t faddwrapper(uint64_t iterations) { return faddtest(iterations, fpTestArr); } uint64_t latfaddwrapper(uint64_t iterations) { return latfaddtest(iterations, fpTestArr); } uint64_t latvecfadd128wrapper(uint64_t iterations) { return latvecfadd128test(iterations, fpTestArr); } uint64_t latvecfmul128wrapper(uint64_t iterations) { return latvecfmul128test(iterations, fpTestArr); } uint64_t vecfadd128wrapper(uint64_t iterations) { return vecfadd128test(iterations, fpTestArr); } uint64_t vecfmul128wrapper(uint64_t iterations) { return vecfmul128test(iterations, fpTestArr); } uint64_t mixvecfaddfmul128wrapper(uint64_t iterations) { return mixvecfaddfmul128test(iterations, fpTestArr); } uint64_t mixaddvecadd128wrapper(uint64_t iterations) { return mixaddvecadd128test(iterations, intTestArr); } uint64_t mix3to1addvecadd128wrapper(uint64_t iterations) { return mix3to1addvecadd128test(iterations, intTestArr); } uint64_t mix1to1addvecadd128wrapper(uint64_t iterations) { return mix1to1addvecadd128test(iterations, intTestArr); } uint64_t mixmulvecmulwrapper(uint64_t iterations) { return mixmulvecmultest(iterations, intTestArr); } uint64_t mixvecmulfmulwrapper(uint64_t iterations) { return mixvecmulfmultest(iterations, fpTestArr, intTestArr); } uint64_t mixvecaddfaddwrapper(uint64_t iterations) { return mixvecaddfaddtest(iterations, fpTestArr, intTestArr); } uint64_t mixjmpvecaddwrapper(uint64_t iterations) { return mixjmpvecaddtest(iterations, intTestArr); } uint64_t mixjmpvecmulwrapper(uint64_t iterations) { return mixjmpvecmultest(iterations, intTestArr); } uint64_t vecloadwrapper(uint64_t iterations) { return vecloadtest(iterations, intTestArr); } uint64_t vecstorewrapper(uint64_t iterations) { return vecstoretest(iterations, intTestArr, sinkArr); } uint64_t loadwrapper(uint64_t iterations) { if (((uint64_t)intTestArr & 63) != 0) { printf("Warning - load may not be 64B aligned\n"); } return loadtest(iterations, intTestArr); 
} /* Mixed load/store tests: forward the iteration count with intTestArr and sinkArr. */ uint64_t mixloadstorewrapper(uint64_t iterations) { return mixloadstoretest(iterations, intTestArr, sinkArr); } uint64_t mix21loadstorewrapper(uint64_t iterations) { return mix21loadstoretest(iterations, intTestArr, sinkArr); } /* FMA tests: forward the iteration count with fpTestArr. */ uint64_t vecfma128wrapper(uint64_t iterations) { return vecfma128test(iterations, fpTestArr); } uint64_t scalarfmawrapper(uint64_t iterations) { return scalarfmatest(iterations, fpTestArr); } uint64_t latscalarfmawrapper(uint64_t iterations) { return latscalarfmatest(iterations, fpTestArr); } uint64_t latvecfma128wrapper(uint64_t iterations) { return latvecfma128test(iterations, fpTestArr); } uint64_t mixvecfmulfma128wrapper(uint64_t iterations) { return mixvecfmulfma128test(iterations, fpTestArr); } uint64_t mixvecfaddfma128wrapper(uint64_t iterations) { return mixvecfaddfma128test(iterations, fpTestArr); } /* aese* / pmull* tests: forward the iteration count with intTestArr. */ uint64_t aesetestwrapper(uint64_t iterations) { return aesetest(iterations, intTestArr); } uint64_t mixaesevecadd128wrapper(uint64_t iterations) { return mixaesevecadd128test(iterations, intTestArr); } uint64_t pmullwrapper(uint64_t iterations) { return pmulltest(iterations, intTestArr); } uint64_t mixpmulladd128wrapper(uint64_t iterations) { return mixpmulladd128test(iterations, intTestArr); } ================================================ FILE: InstructionRate/arm_instructionrate.s ================================================ .text /* Every test is exported under two names, plain and with a leading underscore - presumably for toolchains that prefix C symbols with '_' (TODO confirm). */ .global clktest .global addtest .global eortest .global maddaddtest .global cmptest .global addmultest .global addmul21test .global mixaddjmp21test .global mul32test .global mul64test .global latmul64test .global noptest .global fusejmptest .global jmptest .global mixmuljmptest .global mixmuljmptest21 .global mixaddjmptest .global rortest .global mixmulrortest .global _clktest .global _addtest .global _eortest .global _maddaddtest .global _cmptest .global _addmultest .global _addmul21test .global _mixaddjmp21test .global _mul32test .global _mul64test .global _latmul64test .global _noptest .global 
_fusejmptest .global _jmptest .global _mixmuljmptest .global _mixmuljmptest21 .global _mixaddjmptest .global _rortest .global _mixmulrortest .global vecadd128test .global latvecadd128test .global vecmul128test .global latvecmul128test .global mixvecaddmul128test .global faddtest .global latfaddtest .global latfmultest .global latvecfadd128test .global latvecfmul128test .global vecfadd128test .global vecfmul128test .global mixvecfaddfmul128test .global mixaddvecadd128test .global mix3to1addvecadd128test .global mix1to1addvecadd128test .global mixmulvecmultest .global mixvecmulfmultest .global mixvecaddfaddtest .global mixjmpvecaddtest .global mixjmpvecmultest .global vecfma128test .global latvecfma128test .global scalarfmatest .global latscalarfmatest .global aesetest .global mixaesevecadd128test .global pmulltest .global mixpmulladd128test .global _vecadd128test .global _latvecadd128test .global _vecmul128test .global _latvecmul128test .global _mixvecaddmul128test .global _faddtest .global _latfaddtest .global _latfmultest .global _latvecfadd128test .global _latvecfmul128test .global _vecfadd128test .global _vecfmul128test .global _mixvecfaddfmul128test .global _mixaddvecadd128test .global _mix3to1addvecadd128test .global _mix1to1addvecadd128test .global _mixmulvecmultest .global _mixvecmulfmultest .global _mixvecaddfaddtest .global _mixjmpvecaddtest .global _mixjmpvecmultest .global _vecfma128test .global _latvecfma128test .global _scalarfmatest .global _latscalarfmatest .global mixvecfaddfma128test .global mixvecfmulfma128test .global loadtest .global mixloadstoretest .global mix21loadstoretest .global vecloadtest .global vecstoretest .global _mixvecfaddfma128test .global _mixvecfmulfma128test .global _loadtest .global _mixloadstoretest .global _mix21loadstoretest .global _vecloadtest .global _vecstoretest //renamer tests .global indepmovtest .global depmovtest .global xorzerotest .global movzerotest .global subzerotest .global _indepmovtest .global _depmovtest 
.global _xorzerotest .global _movzerotest .global _subzerotest .global _aesetest .global _mixaesevecadd128test .global _pmulltest .global _mixpmulladd128test .balign 4 /* x0 = arg = iteration count. all iteration counts must be divisible by 10 */ _clktest: clktest: sub sp, sp, #0x30 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] mov x15, 1 mov x14, 20 eor x13, x13, x13 clktest_loop: add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 add x13, x13, x15 sub x0, x0, x14 cbnz x0, clktest_loop ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x30 ret _noptest: noptest: sub sp, sp, #0x30 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] mov x15, 1 mov x14, 30 eor x13, x13, x13 noptest_loop: nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop nop sub x0, x0, x14 cbnz x0, noptest_loop ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x30 ret _addtest: addtest: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 1 mov x14, 30 eor x13, x13, x13 eor x12, x12, x12 eor x11, x11, x11 eor x10, x10, x10 eor x9, x9, x9 addtest_loop: add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add x10, x10, x15 add x9, x9, x15 add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add x10, x10, x15 add x9, x9, x15 add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add x10, x10, x15 add x9, x9, x15 add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add x10, x10, x15 add x9, x9, x15 add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add x10, x10, x15 add x9, x9, x15 add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add 
x10, x10, x15 add x9, x9, x15 sub x0, x0, x14 cbnz x0, addtest_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _maddaddtest: maddaddtest: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 1 mov x14, 20 eor x13, x13, x13 eor x12, x12, x12 eor x11, x11, x11 mov x10, 2 eor x9, x9, x9 mov x8, 3 maddaddtest_loop: add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 madd x10, x8, x0, x15 add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 madd x10, x8, x0, x15 add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 madd x10, x8, x0, x15 add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 madd x10, x8, x0, x15 add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 madd x10, x8, x0, x15 sub x0, x0, x14 cbnz x0, maddaddtest_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _eortest: eortest: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 1 mov x14, 30 eor x13, x13, x13 eor x12, x12, x12 eor x11, x11, x11 eor x10, x10, x10 eor x9, x9, x9 eortest_loop: eor x13, x13, x15 eor x12, x12, x15 eor x11, x11, x15 eor x10, x10, x15 eor x9, x9, x15 eor x13, x13, x15 eor x12, x12, x15 eor x11, x11, x15 eor x10, x10, x15 eor x9, x9, x15 eor x13, x13, x15 eor x12, x12, x15 eor x11, x11, x15 eor x10, x10, x15 eor x9, x9, x15 eor x13, x13, x15 eor x12, x12, x15 eor x11, x11, x15 eor x10, x10, x15 eor x9, x9, x15 eor x13, x13, x15 eor x12, x12, x15 eor x11, x11, x15 eor x10, x10, x15 eor x9, x9, x15 eor x13, x13, x15 eor x12, x12, x15 eor x11, x11, x15 eor x10, x10, x15 eor x9, x9, x15 sub x0, x0, x14 cbnz x0, eortest_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _cmptest: cmptest: sub sp, sp, 
#0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 1 mov x14, 30 eor x13, x13, x13 eor x12, x12, x12 eor x11, x11, x11 eor x10, x10, x10 eor x9, x9, x9 cmptest_loop: cmp x13, x13 cmp x12, x12 cmp x11, x11 cmp x10, x10 cmp x9, x9 cmp x13, x13 cmp x12, x12 cmp x11, x11 cmp x10, x10 cmp x9, x9 cmp x13, x13 cmp x12, x12 cmp x11, x11 cmp x10, x10 cmp x9, x9 cmp x13, x13 cmp x12, x12 cmp x11, x11 cmp x10, x10 cmp x9, x9 cmp x13, x13 cmp x12, x12 cmp x11, x11 cmp x10, x10 cmp x9, x9 cmp x13, x13 cmp x12, x12 cmp x11, x11 cmp x10, x10 cmp x9, x9 sub x0, x0, x14 cbnz x0, cmptest_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _addmultest: addmultest: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 2 mov x14, 20 mov x13, 2 eor x12, x12, x12 mov x11, 2 eor x10, x10, x10 mov x9, 2 mov x8, 2 addmultest_loop: mul w13, w13, w15 add x12, x12, x15 mul w11, w11, w15 add x10, x10, x15 mul w9, w9, w15 add x12, x12, x15 mul w8, w8, w15 add x10, x10, x15 mul w13, w13, w15 add x12, x12, x15 mul w11, w11, w15 add x10, x10, x15 mul w9, w9, w15 add x12, x12, x15 mul w8, w8, w15 add x10, x10, x15 mul w13, w13, w15 add x12, x12, x15 mul w11, w11, w15 add x10, x10, x15 sub x0, x0, x14 cbnz x0, addmultest_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _addmul21test: addmul21test: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 2 mov x14, 24 mov x13, 2 eor x12, x12, x12 mov x11, 2 eor x10, x10, x10 mov x9, 2 mov x8, 2 addmul21test_loop: mul w13, w13, w15 add x12, x12, x15 add x10, x10, x15 mul w11, w11, w15 add x12, x12, x15 add x10, x10, x15 mul w9, w9, w15 add x12, x12, x15 add x10, x10, x15 mul w8, w8, w15 
add x12, x12, x15 add x10, x10, x15 mul w13, w13, w15 add x12, x12, x15 add x10, x10, x15 mul w11, w11, w15 add x12, x12, x15 add x10, x10, x15 mul w9, w9, w15 add x12, x12, x15 add x10, x10, x15 mul w8, w8, w15 add x12, x12, x15 add x10, x10, x15 sub x0, x0, x14 cmp x0, 0 b.gt addmul21test_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _mul32test: mul32test: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 2 mov x14, 20 mov x13, x15 mov x12, x15 mov x11, x15 mov x10, x15 mov x9, x15 mov x8, x15 mul32test_loop: mul w13, w13, w15 mul w12, w12, w15 mul w11, w11, w15 mul w10, w10, w15 mul w9, w9, w15 mul w8, w8, w15 mul w13, w13, w15 mul w12, w12, w15 mul w11, w11, w15 mul w10, w10, w15 mul w9, w9, w15 mul w8, w8, w15 mul w13, w13, w15 mul w12, w12, w15 mul w11, w11, w15 mul w10, w10, w15 mul w9, w9, w15 mul w8, w8, w15 mul w13, w13, w15 mul w12, w12, w15 sub x0, x0, x14 cbnz x0, mul32test_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _mul64test: mul64test: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 2 mov x14, 20 mov x13, x15 mov x12, x15 mov x11, x15 mov x10, x15 mov x9, x15 mov x8, x15 mul64test_loop: mul x13, x13, x15 mul x12, x12, x15 mul x11, x11, x15 mul x10, x10, x15 mul x9, x9, x15 mul x8, x8, x15 mul x13, x13, x15 mul x12, x12, x15 mul x11, x11, x15 mul x10, x10, x15 mul x9, x9, x15 mul x8, x8, x15 mul x13, x13, x15 mul x12, x12, x15 mul x11, x11, x15 mul x10, x10, x15 mul x9, x9, x15 mul x8, x8, x15 mul x13, x13, x15 mul x12, x12, x15 sub x0, x0, x14 cbnz x0, mul64test_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _latmul64test: latmul64test: sub sp, 
sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 2 mov x14, 20 mov x13, x15 latmul64test_loop: mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 mul x13, x13, x13 sub x0, x0, x14 cbnz x0, latmul64test_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret /* needs an additional parameter passed in x1 - ptr to array of 4 floats */ _vecadd128test: vecadd128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] vecadd128test_loop: add v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s add v18.4s, v18.4s, v18.4s add v19.4s, v19.4s, v19.4s add v20.4s, v20.4s, v20.4s add v21.4s, v21.4s, v21.4s add v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s add v18.4s, v18.4s, v18.4s add v19.4s, v19.4s, v19.4s add v20.4s, v20.4s, v20.4s add v21.4s, v21.4s, v21.4s add v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s add v18.4s, v18.4s, v18.4s add v19.4s, v19.4s, v19.4s add v20.4s, v20.4s, v20.4s add v21.4s, v21.4s, v21.4s add v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s sub x0, x0, x14 cbnz x0, vecadd128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _latvecadd128test: latvecadd128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] latvecadd128test_loop: add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add 
v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s add v16.4s, v16.4s, v16.4s sub x0, x0, x14 cbnz x0, latvecadd128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _faddtest: faddtest: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr s16, [x1] ldr s17, [x1, #0x4] ldr s18, [x1, #0x8] ldr s19, [x1, #0xC] ldr s20, [x1] ldr s21, [x1, #0x4] faddtest_loop: fadd s16, s16, s16 fadd s17, s17, s17 fadd s18, s18, s18 fadd s19, s19, s19 fadd s20, s20, s20 fadd s21, s21, s21 fadd s16, s16, s16 fadd s17, s17, s17 fadd s18, s18, s18 fadd s19, s19, s19 fadd s20, s20, s20 fadd s21, s21, s21 fadd s16, s16, s16 fadd s17, s17, s17 fadd s18, s18, s18 fadd s19, s19, s19 fadd s20, s20, s20 fadd s21, s21, s21 fadd s16, s16, s16 fadd s17, s17, s17 sub x0, x0, x14 cbnz x0, faddtest_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _latfaddtest: latfaddtest: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr s16, [x1] latfaddtest_loop: fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 fadd s16, s16, s16 sub x0, x0, x14 cbnz x0, latfaddtest_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _latfmultest: latfmultest: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr s16, [x1] latfmultest_loop: fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul 
s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 fmul s16, s16, s16 sub x0, x0, x14 cbnz x0, latfmultest_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _latvecmul128test: latvecmul128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] latvecmul128test_loop: mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s mul v16.4s, v16.4s, v16.4s sub x0, x0, x14 cbnz x0, latvecmul128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _vecmul128test: vecmul128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] vecmul128test_loop: mul v16.4s, v16.4s, v16.4s mul v17.4s, v17.4s, v17.4s mul v18.4s, v18.4s, v18.4s mul v19.4s, v19.4s, v19.4s mul v20.4s, v20.4s, v20.4s mul v21.4s, v21.4s, v21.4s mul v16.4s, v16.4s, v16.4s mul v17.4s, v17.4s, v17.4s mul v18.4s, v18.4s, v18.4s mul v19.4s, v19.4s, v19.4s mul v20.4s, v20.4s, v20.4s mul v21.4s, v21.4s, v21.4s mul v16.4s, v16.4s, v16.4s mul v17.4s, v17.4s, v17.4s mul v18.4s, v18.4s, v18.4s mul v19.4s, v19.4s, v19.4s mul v20.4s, v20.4s, v20.4s mul v21.4s, v21.4s, v21.4s mul v16.4s, v16.4s, v16.4s mul v17.4s, v17.4s, v17.4s sub x0, x0, x14 cbnz x0, vecmul128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _scalarfmatest: scalarfmatest: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] ldr q22, [x1] ldr q23, [x1] ldr q24, [x1] 
ldr q25, [x1] scalarfmatest_loop: fmadd s16, s16, s16, s16 fmadd s17, s17, s17, s17 fmadd s18, s18, s18, s18 fmadd s19, s19, s19, s19 fmadd s20, s20, s20, s20 fmadd s21, s21, s21, s21 fmadd s22, s22, s22, s22 fmadd s23, s23, s23, s23 fmadd s24, s24, s24, s24 fmadd s25, s25, s25, s25 fmadd s16, s16, s16, s16 fmadd s17, s17, s17, s17 fmadd s18, s18, s18, s18 fmadd s19, s19, s19, s19 fmadd s20, s20, s20, s20 fmadd s21, s21, s21, s21 fmadd s22, s22, s22, s22 fmadd s23, s23, s23, s23 fmadd s24, s24, s24, s24 fmadd s25, s25, s25, s25 sub x0, x0, x14 cbnz x0, scalarfmatest_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _latscalarfmatest: latscalarfmatest: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] latscalarfmatest_loop: fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 fmadd s16, s16, s16, s16 sub x0, x0, x14 cbnz x0, latscalarfmatest_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _vecfma128test: vecfma128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] ldr q22, [x1] ldr q23, [x1] ldr q24, [x1] ldr q25, [x1] vecfma128test_loop: fmla v16.4s, v16.4s, v16.4s fmla v17.4s, v17.4s, v17.4s fmla v18.4s, v18.4s, v18.4s fmla v19.4s, v19.4s, v19.4s fmla v20.4s, v20.4s, v20.4s fmla v21.4s, v21.4s, v21.4s fmla v22.4s, v22.4s, v22.4s fmla v23.4s, v23.4s, v23.4s fmla v24.4s, v24.4s, v24.4s fmla v25.4s, v25.4s, v25.4s fmla v16.4s, v16.4s, v16.4s fmla v17.4s, v17.4s, v17.4s fmla v18.4s, v18.4s, v18.4s fmla v19.4s, v19.4s, v19.4s fmla 
v20.4s, v20.4s, v20.4s fmla v21.4s, v21.4s, v21.4s fmla v22.4s, v22.4s, v22.4s fmla v23.4s, v23.4s, v23.4s fmla v24.4s, v24.4s, v24.4s fmla v25.4s, v25.4s, v25.4s sub x0, x0, x14 cbnz x0, vecfma128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _mixvecfmulfma128test: mixvecfmulfma128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] ldr q22, [x1] ldr q23, [x1] ldr q24, [x1] ldr q25, [x1] mixvecfmulfma128test_loop: fmla v16.4s, v16.4s, v16.4s fmul v17.4s, v17.4s, v17.4s fmla v18.4s, v18.4s, v18.4s fmul v19.4s, v19.4s, v19.4s fmla v20.4s, v20.4s, v20.4s fmul v21.4s, v21.4s, v21.4s fmla v22.4s, v22.4s, v22.4s fmul v23.4s, v23.4s, v23.4s fmla v24.4s, v24.4s, v24.4s fmul v25.4s, v25.4s, v25.4s fmla v16.4s, v16.4s, v16.4s fmul v17.4s, v17.4s, v17.4s fmla v18.4s, v18.4s, v18.4s fmul v19.4s, v19.4s, v19.4s fmla v20.4s, v20.4s, v20.4s fmul v21.4s, v21.4s, v21.4s fmla v22.4s, v22.4s, v22.4s fmul v23.4s, v23.4s, v23.4s fmla v24.4s, v24.4s, v24.4s fmul v25.4s, v25.4s, v25.4s sub x0, x0, x14 cbnz x0, mixvecfmulfma128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _mixvecfaddfma128test: mixvecfaddfma128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] ldr q22, [x1] ldr q23, [x1] ldr q24, [x1] ldr q25, [x1] mixvecfaddfma128test_loop: fmla v16.4s, v16.4s, v16.4s fadd v17.4s, v17.4s, v17.4s fmla v18.4s, v18.4s, v18.4s fadd v19.4s, v19.4s, v19.4s fmla v20.4s, v20.4s, v20.4s fadd v21.4s, v21.4s, v21.4s fmla v22.4s, v22.4s, v22.4s fadd v23.4s, v23.4s, v23.4s fmla v24.4s, v24.4s, v24.4s fadd v25.4s, v25.4s, v25.4s fmla v16.4s, v16.4s, v16.4s fadd v17.4s, v17.4s, v17.4s fmla v18.4s, v18.4s, v18.4s fadd v19.4s, v19.4s, v19.4s fmla v20.4s, v20.4s, v20.4s fadd v21.4s, v21.4s, v21.4s fmla v22.4s, v22.4s, v22.4s fadd v23.4s, v23.4s, v23.4s fmla v24.4s, v24.4s, v24.4s fadd v25.4s, 
v25.4s, v25.4s sub x0, x0, x14 cbnz x0, mixvecfaddfma128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _latvecfma128test: latvecfma128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] latvecfma128test_loop: fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s fmla v16.4s, v16.4s, v16.4s sub x0, x0, x14 cbnz x0, latvecfma128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _vecfadd128test: vecfadd128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] vecfadd128test_loop: fadd v16.4s, v16.4s, v16.4s fadd v17.4s, v17.4s, v17.4s fadd v18.4s, v18.4s, v18.4s fadd v19.4s, v19.4s, v19.4s fadd v20.4s, v20.4s, v20.4s fadd v21.4s, v21.4s, v21.4s fadd v16.4s, v16.4s, v16.4s fadd v17.4s, v17.4s, v17.4s fadd v18.4s, v18.4s, v18.4s fadd v19.4s, v19.4s, v19.4s fadd v20.4s, v20.4s, v20.4s fadd v21.4s, v21.4s, v21.4s fadd v16.4s, v16.4s, v16.4s fadd v17.4s, v17.4s, v17.4s fadd v18.4s, v18.4s, v18.4s fadd v19.4s, v19.4s, v19.4s fadd v20.4s, v20.4s, v20.4s fadd v21.4s, v21.4s, v21.4s fadd v16.4s, v16.4s, v16.4s fadd v17.4s, v17.4s, v17.4s sub x0, x0, x14 cbnz x0, vecfadd128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _latvecfadd128test: latvecfadd128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] latvecfadd128test_loop: fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd 
v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s fadd v16.4s, v16.4s, v16.4s sub x0, x0, x14 cbnz x0, latvecfadd128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _vecfmul128test: vecfmul128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] vecfmul128test_loop: fmul v16.4s, v16.4s, v16.4s fmul v17.4s, v17.4s, v17.4s fmul v18.4s, v18.4s, v18.4s fmul v19.4s, v19.4s, v19.4s fmul v20.4s, v20.4s, v20.4s fmul v21.4s, v21.4s, v21.4s fmul v16.4s, v16.4s, v16.4s fmul v17.4s, v17.4s, v17.4s fmul v18.4s, v18.4s, v18.4s fmul v19.4s, v19.4s, v19.4s fmul v20.4s, v20.4s, v20.4s fmul v21.4s, v21.4s, v21.4s fmul v16.4s, v16.4s, v16.4s fmul v17.4s, v17.4s, v17.4s fmul v18.4s, v18.4s, v18.4s fmul v19.4s, v19.4s, v19.4s fmul v20.4s, v20.4s, v20.4s fmul v21.4s, v21.4s, v21.4s fmul v16.4s, v16.4s, v16.4s fmul v17.4s, v17.4s, v17.4s sub x0, x0, x14 cbnz x0, vecfmul128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _latvecfmul128test: latvecfmul128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] latvecfmul128test_loop: fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul 
v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s fmul v16.4s, v16.4s, v16.4s sub x0, x0, x14 cbnz x0, latvecfmul128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _mixvecfaddfmul128test: mixvecfaddfmul128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] ldr q22, [x1] ldr q23, [x1] mixvecfaddfmul128test_loop: fmul v16.4s, v16.4s, v16.4s fadd v17.4s, v17.4s, v17.4s fmul v18.4s, v18.4s, v18.4s fadd v19.4s, v19.4s, v19.4s fmul v20.4s, v20.4s, v20.4s fadd v21.4s, v21.4s, v21.4s fmul v22.4s, v22.4s, v22.4s fadd v23.4s, v23.4s, v23.4s fmul v16.4s, v16.4s, v16.4s fadd v17.4s, v17.4s, v17.4s fmul v18.4s, v18.4s, v18.4s fadd v19.4s, v19.4s, v19.4s fmul v20.4s, v20.4s, v20.4s fadd v21.4s, v21.4s, v21.4s fmul v22.4s, v22.4s, v22.4s fadd v23.4s, v23.4s, v23.4s fmul v16.4s, v16.4s, v16.4s fadd v17.4s, v17.4s, v17.4s fmul v18.4s, v18.4s, v18.4s fadd v19.4s, v19.4s, v19.4s sub x0, x0, x14 cbnz x0, mixvecfaddfmul128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _mixvecaddmul128test: mixvecaddmul128test: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] ldr q22, [x1] ldr q23, [x1] mixvecaddmul128test_loop: mul v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s mul v18.4s, v18.4s, v18.4s add v19.4s, v19.4s, v19.4s mul v20.4s, v20.4s, v20.4s add v21.4s, v21.4s, v21.4s mul v22.4s, v22.4s, v22.4s add v23.4s, v23.4s, v23.4s mul v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s mul v18.4s, v18.4s, v18.4s add v19.4s, v19.4s, v19.4s mul v20.4s, v20.4s, v20.4s add v21.4s, v21.4s, v21.4s mul v22.4s, v22.4s, v22.4s add v23.4s, v23.4s, v23.4s mul v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s mul v18.4s, v18.4s, v18.4s add v19.4s, v19.4s, v19.4s sub x0, x0, x14 cbnz x0, mixvecaddmul128test_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _mixaddvecadd128test: mixaddvecadd128test: sub 
sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 1 mov x14, 30 eor x13, x13, x13 eor x12, x12, x12 eor x11, x11, x11 eor x10, x10, x10 eor x9, x9, x9 eor x8, x8, x8 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] ldr q22, [x1] ldr q23, [x1] mixaddvecadd128test_loop: add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add x10, x10, x15 add v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add x10, x10, x15 add v18.4s, v18.4s, v18.4s add v19.4s, v19.4s, v19.4s add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add x10, x10, x15 add v20.4s, v20.4s, v20.4s add v21.4s, v21.4s, v21.4s add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add x10, x10, x15 add v20.4s, v20.4s, v20.4s add v21.4s, v21.4s, v21.4s add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add x10, x10, x15 add v22.4s, v22.4s, v22.4s add v23.4s, v23.4s, v23.4s sub x0, x0, x14 cbnz x0, mixaddvecadd128test_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _mix3to1addvecadd128test: mix3to1addvecadd128test: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 1 mov x14, 40 eor x13, x13, x13 eor x12, x12, x12 eor x11, x11, x11 eor x10, x10, x10 eor x9, x9, x9 eor x8, x8, x8 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] ldr q22, [x1] ldr q23, [x1] mix3to1addvecadd128test_loop: add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add v16.4s, v16.4s, v16.4s add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add v17.4s, v17.4s, v17.4s add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add v18.4s, v18.4s, v18.4s add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add v19.4s, v19.4s, v19.4s add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 
add v20.4s, v20.4s, v20.4s add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add v16.4s, v16.4s, v16.4s add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add v17.4s, v17.4s, v17.4s add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add v18.4s, v18.4s, v18.4s add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add v19.4s, v19.4s, v19.4s add x13, x13, x15 add x12, x12, x15 add x11, x11, x15 add v20.4s, v20.4s, v20.4s sub x0, x0, x14 cbnz x0, mix3to1addvecadd128test_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _mix1to1addvecadd128test: mix1to1addvecadd128test: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 1 mov x14, 40 eor x13, x13, x13 eor x12, x12, x12 eor x11, x11, x11 eor x10, x10, x10 eor x9, x9, x9 eor x8, x8, x8 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] ldr q22, [x1] ldr q23, [x1] mix1to1addvecadd128test_loop: add x13, x13, x15 add v16.4s, v16.4s, v16.4s add x12, x12, x15 add v17.4s, v17.4s, v17.4s add x11, x11, x15 add v18.4s, v18.4s, v18.4s add x10, x10, x15 add v19.4s, v19.4s, v19.4s add x13, x13, x15 add v16.4s, v16.4s, v16.4s add x12, x12, x15 add v17.4s, v17.4s, v17.4s add x11, x11, x15 add v18.4s, v18.4s, v18.4s add x10, x10, x15 add v19.4s, v19.4s, v19.4s add x13, x13, x15 add v16.4s, v16.4s, v16.4s add x12, x12, x15 add v17.4s, v17.4s, v17.4s add x11, x11, x15 add v18.4s, v18.4s, v18.4s add x10, x10, x15 add v19.4s, v19.4s, v19.4s add x13, x13, x15 add v16.4s, v16.4s, v16.4s add x12, x12, x15 add v17.4s, v17.4s, v17.4s add x11, x11, x15 add v18.4s, v18.4s, v18.4s add x10, x10, x15 add v19.4s, v19.4s, v19.4s add x13, x13, x15 add v16.4s, v16.4s, v16.4s add x12, x12, x15 add v17.4s, v17.4s, v17.4s add x11, x11, x15 add v18.4s, v18.4s, v18.4s add x10, x10, x15 add v19.4s, v19.4s, v19.4s sub x0, x0, x14 cbnz x0, 
mix1to1addvecadd128test_loop
    ldp x8, x9, [sp, #0x40]
    ldp x10, x11, [sp, #0x30]
    ldp x12, x13, [sp, #0x20]
    ldp x14, x15, [sp, #0x10]
    add sp, sp, #0x50
    ret

/*
 * mixmulvecmultest: alternates a scalar 32-bit multiply with a 128-bit
 * vector integer multiply (.4s), 10 of each per loop pass.
 *   x0 = iteration count. It is reduced by x14 (20) per pass and the loop
 *        exits only when it reaches exactly zero (cbnz), so the caller must
 *        pass a non-zero multiple of 20.
 *   x1 = address of the 16 bytes loaded into q16-q23.
 * Five independent scalar chains (w8-w12, each multiplied by w15 = 2) and
 * five independent vector chains (v16-v20, each squared) are used.
 * x7, x13 and q21-q23 are initialised but not touched by the loop.
 */
_mixmulvecmultest:
mixmulvecmultest:
    sub sp, sp, #0x50
    stp x14, x15, [sp, #0x10]
    stp x12, x13, [sp, #0x20]
    stp x10, x11, [sp, #0x30]
    stp x8, x9, [sp, #0x40]
    mov x15, 2
    mov x14, 20                     /* instructions counted per loop pass */
    mov x13, x15
    mov x12, x15
    mov x11, x15
    mov x10, x15
    mov x9, x15
    mov x8, x15
    mov x7, x15
    ldr q16, [x1]
    ldr q17, [x1]
    ldr q18, [x1]
    ldr q19, [x1]
    ldr q20, [x1]
    ldr q21, [x1]
    ldr q22, [x1]
    ldr q23, [x1]
mixmulvecmultest_loop:
    mul w8, w8, w15
    mul v16.4s, v16.4s, v16.4s
    mul w9, w9, w15
    mul v17.4s, v17.4s, v17.4s
    mul w10, w10, w15
    mul v18.4s, v18.4s, v18.4s
    mul w11, w11, w15
    mul v19.4s, v19.4s, v19.4s
    mul w12, w12, w15
    mul v20.4s, v20.4s, v20.4s
    mul w8, w8, w15
    mul v16.4s, v16.4s, v16.4s
    mul w9, w9, w15
    mul v17.4s, v17.4s, v17.4s
    mul w10, w10, w15
    mul v18.4s, v18.4s, v18.4s
    mul w11, w11, w15
    mul v19.4s, v19.4s, v19.4s
    mul w12, w12, w15
    mul v20.4s, v20.4s, v20.4s
    sub x0, x0, x14
    cbnz x0, mixmulvecmultest_loop
    ldp x8, x9, [sp, #0x40]
    ldp x10, x11, [sp, #0x30]
    ldp x12, x13, [sp, #0x20]
    ldp x14, x15, [sp, #0x10]
    add sp, sp, #0x50
    ret

/*
 * mixvecmulfmultest: alternates a vector floating-point multiply (fmul .4s)
 * with a vector integer multiply (mul .4s), 20 instructions per loop pass.
 *   x0 = iteration count, reduced by x14 (20) per pass; the loop exits only
 *        when it reaches exactly zero.
 *   x1 = address of the 16 bytes loaded into q16, q18, q20 (fmul operands).
 *   x2 = address of the 16 bytes loaded into q17, q19, q21 (mul operands).
 */
_mixvecmulfmultest:
mixvecmulfmultest:
    sub sp, sp, #0x20
    stp x14, x15, [sp, #0x10]
    mov x14, 20
    ldr q16, [x1]
    ldr q17, [x2]
    ldr q18, [x1]
    ldr q19, [x2]
    ldr q20, [x1]
    ldr q21, [x2]
mixvecmulfmultest_loop:
    fmul v16.4s, v16.4s, v16.4s
    mul v17.4s, v17.4s, v17.4s
    fmul v18.4s, v18.4s, v18.4s
    mul v19.4s, v19.4s, v19.4s
    fmul v20.4s, v20.4s, v20.4s
    mul v21.4s, v21.4s, v21.4s
    fmul v16.4s, v16.4s, v16.4s
    mul v17.4s, v17.4s, v17.4s
    fmul v18.4s, v18.4s, v18.4s
    mul v19.4s, v19.4s, v19.4s
    fmul v20.4s, v20.4s, v20.4s
    mul v21.4s, v21.4s, v21.4s
    fmul v16.4s, v16.4s, v16.4s
    mul v17.4s, v17.4s, v17.4s
    fmul v18.4s, v18.4s, v18.4s
    mul v19.4s, v19.4s, v19.4s
    fmul v20.4s, v20.4s, v20.4s
    mul v21.4s, v21.4s, v21.4s
    fmul v16.4s, v16.4s, v16.4s
    mul v17.4s, v17.4s, v17.4s
    sub x0, x0, x14
    cbnz x0, mixvecmulfmultest_loop
    ldp
x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _mixvecaddfaddtest: mixvecaddfaddtest: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] ldr q17, [x2] ldr q18, [x1] ldr q19, [x2] ldr q20, [x1] ldr q21, [x2] mixvecaddfaddtest_loop: fadd v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s fadd v18.4s, v18.4s, v18.4s add v19.4s, v19.4s, v19.4s fadd v20.4s, v20.4s, v20.4s add v21.4s, v21.4s, v21.4s fadd v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s fadd v18.4s, v18.4s, v18.4s add v19.4s, v19.4s, v19.4s fadd v20.4s, v20.4s, v20.4s add v21.4s, v21.4s, v21.4s fadd v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s fadd v18.4s, v18.4s, v18.4s add v19.4s, v19.4s, v19.4s fadd v20.4s, v20.4s, v20.4s add v21.4s, v21.4s, v21.4s fadd v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s sub x0, x0, x14 cbnz x0, mixvecaddfaddtest_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _mixjmpvecaddtest: mixjmpvecaddtest: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 30 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] mixjmpvecaddtest_loop: add v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s cbz x0, mixjmpvecaddtest_jellydonut add v18.4s, v18.4s, v18.4s add v19.4s, v19.4s, v19.4s cbz x0, mixjmpvecaddtest_jellydonut add v20.4s, v20.4s, v20.4s add v16.4s, v16.4s, v16.4s cbz x0, mixjmpvecaddtest_jellydonut add v17.4s, v17.4s, v17.4s add v18.4s, v18.4s, v18.4s cbz x0, mixjmpvecaddtest_jellydonut add v19.4s, v19.4s, v19.4s add v20.4s, v20.4s, v20.4s cbz x0, mixjmpvecaddtest_jellydonut add v16.4s, v16.4s, v16.4s add v17.4s, v17.4s, v17.4s cbz x0, mixjmpvecaddtest_jellydonut add v18.4s, v18.4s, v18.4s add v19.4s, v19.4s, v19.4s cbz x0, mixjmpvecaddtest_jellydonut add v20.4s, v20.4s, v20.4s add v16.4s, v16.4s, v16.4s cbz x0, mixjmpvecaddtest_jellydonut add v17.4s, v17.4s, v17.4s add v18.4s, v18.4s, v18.4s cbz x0, mixjmpvecaddtest_jellydonut add v19.4s, v19.4s, v19.4s add v20.4s, v20.4s, v20.4s cbz x0, mixjmpvecaddtest_jellydonut sub x0, x0, x14 cbnz 
x0, mixjmpvecaddtest_loop mixjmpvecaddtest_jellydonut: ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _mixjmpvecmultest: mixjmpvecmultest: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] mixjmpvecmultest_loop: mul v16.4s, v16.4s, v16.4s cbz x0, mixjmpvecmultest_jellydonut mul v17.4s, v17.4s, v17.4s cbz x0, mixjmpvecmultest_jellydonut mul v18.4s, v18.4s, v18.4s cbz x0, mixjmpvecmultest_jellydonut mul v19.4s, v19.4s, v19.4s cbz x0, mixjmpvecmultest_jellydonut mul v20.4s, v20.4s, v20.4s cbz x0, mixjmpvecmultest_jellydonut mul v16.4s, v16.4s, v16.4s cbz x0, mixjmpvecmultest_jellydonut mul v17.4s, v17.4s, v17.4s cbz x0, mixjmpvecmultest_jellydonut mul v18.4s, v18.4s, v18.4s cbz x0, mixjmpvecmultest_jellydonut mul v19.4s, v19.4s, v19.4s cbz x0, mixjmpvecmultest_jellydonut mul v20.4s, v20.4s, v20.4s cbz x0, mixjmpvecmultest_jellydonut sub x0, x0, x14 cbnz x0, mixjmpvecmultest_loop mixjmpvecmultest_jellydonut: ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _vecloadtest: vecloadtest: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 vecloadtest_loop: ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] sub x0, x0, x14 cbnz x0, vecloadtest_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _vecstoretest: vecstoretest: sub sp, sp, #0x20 stp x14, x15, [sp, #0x10] mov x14, 20 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] vecstoretest_loop: str q16, [x2] str q17, [x2] str q18, [x2] str q19, [x2] str q20, [x2] str q16, [x2] str q17, [x2] str q18, [x2] str q19, [x2] str q20, [x2] str q16, [x2] str q17, [x2] str q18, [x2] str q19, [x2] str q20, [x2] str q16, [x2] str q17, [x2] str q18, [x2] str q19, [x2] str q20, [x2] sub x0, x0, x14 cbnz x0, 
vecstoretest_loop ldp x14, x15, [sp, #0x10] add sp, sp, #0x20 ret _loadtest: loadtest: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x14, 20 loadtest_loop: ldr x10, [x1] ldr x11, [x1] ldr x12, [x1] ldr x13, [x1] ldr x15, [x1] ldr x10, [x1] ldr x11, [x1] ldr x12, [x1] ldr x13, [x1] ldr x15, [x1] ldr x10, [x1] ldr x11, [x1] ldr x12, [x1] ldr x13, [x1] ldr x15, [x1] ldr x10, [x1] ldr x11, [x1] ldr x12, [x1] ldr x13, [x1] ldr x15, [x1] sub x0, x0, x14 cbnz x0, loadtest_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _mixloadstoretest: mixloadstoretest: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x14, 20 mixloadstoretest_loop: ldr x10, [x1] str x14, [x2] ldr x11, [x1] str x14, [x2] ldr x12, [x1] str x14, [x2] ldr x13, [x1] str x14, [x2] ldr x15, [x1] str x14, [x2] ldr x10, [x1] str x14, [x2] ldr x11, [x1] str x14, [x2] ldr x12, [x1] str x14, [x2] ldr x13, [x1] str x14, [x2] ldr x15, [x1] str x14, [x2] sub x0, x0, x14 cbnz x0, mixloadstoretest_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _mix21loadstoretest: mix21loadstoretest: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x14, 30 mix21loadstoretest_loop: ldr x10, [x1] ldr x11, [x1] str x14, [x2] ldr x12, [x1] ldr x13, [x1] str x14, [x2] ldr x10, [x1] ldr x11, [x1] str x14, [x2] ldr x12, [x1] ldr x13, [x1] str x14, [x2] ldr x10, [x1] ldr x11, [x1] str x14, [x2] ldr x12, [x1] ldr x13, [x1] str x14, [x2] ldr x10, [x1] ldr x11, [x1] str x14, [x2] ldr x12, [x1] ldr x13, [x1] str x14, [x2] ldr x10, [x1] ldr x11, [x1] str x14, [x2] ldr x12, [x1] ldr x13, [x1] str x14, [x2] sub x0, x0, x14 cbnz x0, mix21loadstoretest_loop ldp x8, x9, 
[sp, #0x40]
    ldp x10, x11, [sp, #0x30]
    ldp x12, x13, [sp, #0x20]
    ldp x14, x15, [sp, #0x10]
    add sp, sp, #0x50
    ret

/*
 * jmptest: 20 back-to-back conditional branches per loop pass.
 *   x0 = iteration count, reduced by x14 (20) per pass; the loop exits only
 *        when it reaches exactly zero (cbnz), so pass a non-zero multiple
 *        of 20.
 * Each cbz branches to the epilogue only when x0 is zero; otherwise it
 * falls through to the next one.
 */
_jmptest:
jmptest:
    sub sp, sp, #0x20
    stp x14, x15, [sp, #0x10]
    mov x14, 20
jmptest_loop:
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    cbz x0, jmptest_jellydonut
    sub x0, x0, x14
    cbnz x0, jmptest_loop
jmptest_jellydonut:
    ldp x14, x15, [sp, #0x10]
    add sp, sp, #0x20
    ret

/*
 * fusejmptest: groups of nop, nop, cmp, conditional branch -- presumably to
 * probe whether the core fuses the cmp with the following b.cond.
 *   x0 = iteration count, reduced by x14 (20) per pass.
 * One pass is 4 x (nop, nop, cmp, b.eq) + nop + sub + cmp + b.ne = 20
 * instructions, matching the value in x14.
 * The early-exit branches target this function's own epilogue label so the
 * routine does not depend on jmptest being assembled alongside it.
 */
_fusejmptest:
fusejmptest:
    sub sp, sp, #0x20
    stp x14, x15, [sp, #0x10]
    mov x14, 20
fusejmptest_loop:
    nop
    nop
    cmp x0, 0
    b.eq fusejmptest_jellydonut
    nop
    nop
    cmp x0, 0
    b.eq fusejmptest_jellydonut
    nop
    nop
    cmp x0, 0
    b.eq fusejmptest_jellydonut
    nop
    nop
    cmp x0, 0
    b.eq fusejmptest_jellydonut
    nop
    sub x0, x0, x14
    cmp x0, 0
    b.ne fusejmptest_loop
fusejmptest_jellydonut:
    ldp x14, x15, [sp, #0x10]
    add sp, sp, #0x20
    ret

/*
 * mixmuljmptest: 10 scalar 64-bit multiplies followed by 10 conditional
 * branches per loop pass (1:1 mix, 20 instructions, x14 = 20).
 *   x0 = iteration count, reduced by x14 per pass; the loop exits only when
 *        it reaches exactly zero.
 * x8-x13 are independent accumulators, each multiplied by x15 (5).
 */
_mixmuljmptest:
mixmuljmptest:
    sub sp, sp, #0x50
    stp x14, x15, [sp, #0x10]
    stp x12, x13, [sp, #0x20]
    stp x10, x11, [sp, #0x30]
    stp x8, x9, [sp, #0x40]
    mov x8, 7
    mov x9, 6
    mov x10, 1
    mov x11, 2
    mov x12, 3
    mov x13, 4
    mov x15, 5
    mov x14, 20
mixmuljmptest_loop:
    mul x10, x10, x15
    mul x11, x11, x15
    mul x12, x12, x15
    mul x13, x13, x15
    mul x9, x9, x15
    mul x8, x8, x15
    mul x10, x10, x15
    mul x11, x11, x15
    mul x12, x12, x15
    mul x13, x13, x15
    cbz x0, mixmuljmptest_jellydonut
    cbz x0, mixmuljmptest_jellydonut
    cbz x0, mixmuljmptest_jellydonut
    cbz x0, mixmuljmptest_jellydonut
    cbz x0, mixmuljmptest_jellydonut
    cbz x0, mixmuljmptest_jellydonut
    cbz x0, mixmuljmptest_jellydonut
    cbz x0, mixmuljmptest_jellydonut
    cbz x0, mixmuljmptest_jellydonut
    cbz x0,
mixmuljmptest_jellydonut
    sub x0, x0, x14
    cbnz x0, mixmuljmptest_loop
mixmuljmptest_jellydonut:
    ldp x8, x9, [sp, #0x40]
    ldp x10, x11, [sp, #0x30]
    ldp x12, x13, [sp, #0x20]
    ldp x14, x15, [sp, #0x10]
    add sp, sp, #0x50
    ret

/*
 * mixmuljmptest21: 2:1 mix of scalar multiplies and conditional branches --
 * 10 mul, 10 cbz, 10 mul per loop pass (30 instructions, x14 = 30).
 *   x0 = iteration count, reduced by x14 per pass; the loop exits only when
 *        it reaches exactly zero (cbnz).
 * x8-x13 are independent accumulators, each multiplied by x15 (5).
 */
_mixmuljmptest21:
mixmuljmptest21:
    sub sp, sp, #0x50
    stp x14, x15, [sp, #0x10]
    stp x12, x13, [sp, #0x20]
    stp x10, x11, [sp, #0x30]
    stp x8, x9, [sp, #0x40]
    mov x8, 7
    mov x9, 6
    mov x10, 1
    mov x11, 2
    mov x12, 3
    mov x13, 4
    mov x15, 5
    mov x14, 30
mixmuljmptest21_loop:
    mul x10, x10, x15
    mul x11, x11, x15
    mul x12, x12, x15
    mul x13, x13, x15
    mul x9, x9, x15
    mul x8, x8, x15
    mul x10, x10, x15
    mul x11, x11, x15
    mul x12, x12, x15
    mul x13, x13, x15
    cbz x0, mixmuljmptest21_jellydonut
    cbz x0, mixmuljmptest21_jellydonut
    cbz x0, mixmuljmptest21_jellydonut
    cbz x0, mixmuljmptest21_jellydonut
    cbz x0, mixmuljmptest21_jellydonut
    cbz x0, mixmuljmptest21_jellydonut
    cbz x0, mixmuljmptest21_jellydonut
    cbz x0, mixmuljmptest21_jellydonut
    cbz x0, mixmuljmptest21_jellydonut
    cbz x0, mixmuljmptest21_jellydonut
    mul x10, x10, x15
    mul x11, x11, x15
    mul x12, x12, x15
    mul x13, x13, x15
    mul x9, x9, x15
    mul x8, x8, x15
    mul x10, x10, x15
    mul x11, x11, x15
    mul x12, x12, x15
    mul x13, x13, x15
    sub x0, x0, x14
    cbnz x0, mixmuljmptest21_loop
mixmuljmptest21_jellydonut:
    ldp x8, x9, [sp, #0x40]
    ldp x10, x11, [sp, #0x30]
    ldp x12, x13, [sp, #0x20]
    ldp x14, x15, [sp, #0x10]
    add sp, sp, #0x50
    ret

/*
 * mixaddjmptest: 10 scalar adds followed by 10 conditional branches per loop
 * pass (1:1 mix, 20 instructions, x14 = 20).
 *   x0 = iteration count, reduced by x14 per pass; the loop exits only when
 *        it reaches exactly zero (cbnz).
 * x8-x13 are independent accumulators, each incremented by x15 (5).
 * Every branch, including the loop back-edge, stays inside this function so
 * that each pass really executes the add sequence above.
 */
_mixaddjmptest:
mixaddjmptest:
    sub sp, sp, #0x50
    stp x14, x15, [sp, #0x10]
    stp x12, x13, [sp, #0x20]
    stp x10, x11, [sp, #0x30]
    stp x8, x9, [sp, #0x40]
    mov x8, 7
    mov x9, 6
    mov x10, 1
    mov x11, 2
    mov x12, 3
    mov x13, 4
    mov x15, 5
    mov x14, 20
mixaddjmptest_loop:
    add x10, x10, x15
    add x11, x11, x15
    add x12, x12, x15
    add x13, x13, x15
    add x9, x9, x15
    add x8, x8, x15
    add x10, x10, x15
    add x11, x11, x15
    add x12, x12, x15
    add x13, x13, x15
    cbz x0, mixaddjmptest_jellydonut
    cbz x0, mixaddjmptest_jellydonut
    cbz x0, mixaddjmptest_jellydonut
    cbz x0, mixaddjmptest_jellydonut
    cbz x0, mixaddjmptest_jellydonut
    cbz x0, mixaddjmptest_jellydonut
    cbz x0, mixaddjmptest_jellydonut
    cbz x0, mixaddjmptest_jellydonut
    cbz x0, mixaddjmptest_jellydonut
    cbz x0, mixaddjmptest_jellydonut
    sub x0, x0, x14
    cbnz x0, mixaddjmptest_loop
mixaddjmptest_jellydonut:
    ldp x8, x9, [sp, #0x40]
    ldp x10, x11, [sp, #0x30]
    ldp x12, x13, [sp, #0x20]
    ldp x14, x15, [sp, #0x10]
    add sp, sp, #0x50
    ret

/*
 * mixaddjmp21test: 2:1 mix of scalar adds and conditional branches --
 * five groups of (add, add, cbz) per loop pass (15 instructions, x14 = 15).
 *   x0 = iteration count, reduced by x14 per pass. The back-edge is a signed
 *        "greater than zero" test (cmp / b.gt), so the count does not have
 *        to be a multiple of 15 for the loop to terminate.
 */
_mixaddjmp21test:
mixaddjmp21test:
    sub sp, sp, #0x50
    stp x14, x15, [sp, #0x10]
    stp x12, x13, [sp, #0x20]
    stp x10, x11, [sp, #0x30]
    stp x8, x9, [sp, #0x40]
    mov x8, 7
    mov x9, 6
    mov x10, 1
    mov x11, 2
    mov x12, 3
    mov x13, 4
    mov x15, 5
    mov x14, 15
mixaddjmp21test_loop:
    add x10, x10, x15
    add x11, x11, x15
    cbz x0, mixaddjmp21test_jellydonut
    add x12, x12, x15
    add x13, x13, x15
    cbz x0, mixaddjmp21test_jellydonut
    add x9, x9, x15
    add x8, x8, x15
    cbz x0, mixaddjmp21test_jellydonut
    add x10, x10, x15
    add x11, x11, x15
    cbz x0, mixaddjmp21test_jellydonut
    add x12, x12, x15
    add x13, x13, x15
    cbz x0, mixaddjmp21test_jellydonut
    sub x0, x0, x14
    cmp x0, 0
    b.gt mixaddjmp21test_loop
mixaddjmp21test_jellydonut:
    ldp x8, x9, [sp, #0x40]
    ldp x10, x11, [sp, #0x30]
    ldp x12, x13, [sp, #0x20]
    ldp x14, x15, [sp, #0x10]
    add sp, sp, #0x50
    ret

/*
 * mixmulrortest: blocks of five rotates alternating with blocks of five
 * multiplies, 20 instructions per loop pass (x14 = 20).
 *   x0 = iteration count, reduced by x14 per pass; the loop exits only when
 *        it reaches exactly zero (cbnz).
 * x19-x24 are saved in the frame before being overwritten. The rotates
 * work on x20-x24 (x19 is initialised but not used by the loop); the
 * multiplies work on x8-x13 with x15 (5) as the common factor.
 */
_mixmulrortest:
mixmulrortest:
    sub sp, sp, #0x80
    stp x14, x15, [sp, #0x10]
    stp x12, x13, [sp, #0x20]
    stp x10, x11, [sp, #0x30]
    stp x8, x9, [sp, #0x40]
    stp x19, x20, [sp, #0x50]
    stp x21, x22, [sp, #0x60]
    stp x23, x24, [sp, #0x70]
    mov x8, 7
    mov x9, 6
    mov x10, 1
    mov x11, 2
    mov x12, 3
    mov x13, 4
    mov x15, 5
    mov x19, x8
    mov x20, x8
    mov x21, x8
    mov x22, x8
    mov x23, x8
    mov x24, x8
    mov x14, 20
mixmulrortest_loop:
    ror x24, x24, 1
    ror x23, x23, 1
    ror x22, x22, 1
    ror x21, x21, 1
    ror x20, x20, 1
    mul x10, x10, x15
    mul x11, x11, x15
    mul x12, x12, x15
    mul x13, x13, x15
    mul x9, x9, x15
    ror x24, x24, 1
    ror x23, x23, 1
    ror x22, x22, 1
    ror x21, x21, 1
    ror x20, x20, 1
    mul x8, x8, x15
    mul x10, x10, x15
    mul x11, x11, x15
    mul x12, x12, x15
    mul x13, x13, x15
    sub x0, x0, x14
    cbnz x0, mixmulrortest_loop
    ldp x23, x24, [sp, #0x70]
    ldp x21, x22,
[sp, #0x60] ldp x19, x20, [sp, #0x50] ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x80 ret _rortest: rortest: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x8, 7 mov x9, 6 mov x10, 1 mov x11, 2 mov x12, 3 mov x13, 4 mov x15, 5 mov x14, 20 rortest_loop: ror x10, x10, 1 ror x11, x11, 1 ror x12, x12, 1 ror x13, x13, 1 ror x9, x9, 1 ror x8, x8, 1 ror x10, x10, 1 ror x11, x11, 1 ror x12, x12, 1 ror x13, x13, 1 ror x10, x10, 1 ror x11, x11, 1 ror x12, x12, 1 ror x13, x13, 1 ror x9, x9, 1 ror x8, x8, 1 ror x10, x10, 1 ror x11, x11, 1 ror x12, x12, 1 ror x13, x13, 1 sub x0, x0, x14 cbnz x0, rortest_loop rortest_jellydonut: ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _depmovtest: depmovtest: sub sp, sp, #0x40 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] mov x15, 1 mov x14, 20 eor x13, x13, x13 depmovtest_loop: mov x12, x15 mov x10, x12 mov x13, x10 mov x11, x13 mov x15, x11 mov x12, x15 mov x10, x12 mov x13, x10 mov x11, x13 mov x15, x11 mov x12, x15 mov x10, x12 mov x13, x10 mov x11, x13 mov x15, x11 mov x12, x15 mov x10, x12 mov x13, x10 mov x11, x13 mov x15, x11 sub x0, x0, x14 cbnz x0, depmovtest_loop ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x40 ret _indepmovtest: indepmovtest: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 1 mov x14, 20 eor x13, x13, x13 indepmovtest_loop: mov x10, x15 mov x11, x14 mov x12, x13 mov x9, x15 mov x8, x14 mov x10, x15 mov x11, x14 mov x12, x13 mov x9, x15 mov x8, x14 mov x10, x15 mov x11, x14 mov x12, x13 mov x9, x15 mov x8, x14 mov x10, x15 mov x11, x14 mov x12, x13 mov x9, x15 mov x8, x14 sub x0, x0, x14 cbnz x0, indepmovtest_loop ldp x8, x9, [sp, 
#0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _xorzerotest: xorzerotest: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 1 mov x14, 20 xorzerotest_loop: eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 eor x15, x15, x15 sub x0, x0, x14 cbnz x0, xorzerotest_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _movzerotest: movzerotest: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 1 mov x14, 20 movzerotest_loop: mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 mov x15, 0 sub x0, x0, x14 cbnz x0, movzerotest_loop ldp x8, x9, [sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _subzerotest: subzerotest: sub sp, sp, #0x50 stp x14, x15, [sp, #0x10] stp x12, x13, [sp, #0x20] stp x10, x11, [sp, #0x30] stp x8, x9, [sp, #0x40] mov x15, 1 mov x14, 20 subzerotest_loop: sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x15, x15, x15 sub x0, x0, x14 cbnz x0, subzerotest_loop ldp x8, x9, 
[sp, #0x40] ldp x10, x11, [sp, #0x30] ldp x12, x13, [sp, #0x20] ldp x14, x15, [sp, #0x10] add sp, sp, #0x50 ret _aesetest: aesetest: sub sp, sp, #0x50 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] mov x14, 20 aesetest_loop: aese v0.16b, v16.16b aese v1.16b, v17.16b aese v2.16b, v18.16b aese v3.16b, v19.16b aese v4.16b, v20.16b aese v0.16b, v16.16b aese v1.16b, v17.16b aese v2.16b, v18.16b aese v3.16b, v19.16b aese v4.16b, v20.16b aese v0.16b, v16.16b aese v1.16b, v17.16b aese v2.16b, v18.16b aese v3.16b, v19.16b aese v4.16b, v20.16b aese v0.16b, v16.16b aese v1.16b, v17.16b aese v2.16b, v18.16b aese v3.16b, v19.16b aese v4.16b, v20.16b sub x0, x0, x14 cbnz x0, aesetest_loop add sp, sp, #0x50 ret _mixaesevecadd128test: mixaesevecadd128test: sub sp, sp, #0x50 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] mov x14, 20 mixaesevecadd128test_loop: aese v0.16b, v16.16b add v5.4s, v9.4s, v16.4s aese v1.16b, v17.16b add v6.4s, v10.4s, v16.4s aese v2.16b, v18.16b add v7.4s, v11.4s, v16.4s aese v3.16b, v19.16b add v31.4s, v12.4s, v16.4s aese v4.16b, v20.16b add v30.4s, v13.4s, v16.4s aese v0.16b, v16.16b add v5.4s, v9.4s, v16.4s aese v1.16b, v17.16b add v6.4s, v10.4s, v16.4s aese v2.16b, v18.16b add v7.4s, v11.4s, v16.4s aese v3.16b, v19.16b add v31.4s, v12.4s, v16.4s aese v4.16b, v20.16b add v30.4s, v13.4s, v16.4s sub x0, x0, x14 cbnz x0, mixaesevecadd128test_loop add sp, sp, #0x50 ret _pmulltest: pmulltest: sub sp, sp, #0x50 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] mov x14, 20 pmulltest_loop: pmull v0.1q, v16.1d, v17.1d pmull v1.1q, v16.1d, v17.1d pmull v2.1q, v16.1d, v17.1d pmull v3.1q, v16.1d, v17.1d pmull v4.1q, v16.1d, v17.1d pmull v0.1q, v16.1d, v17.1d pmull v1.1q, v16.1d, v17.1d pmull v2.1q, v16.1d, v17.1d pmull v3.1q, v16.1d, v17.1d pmull v4.1q, v16.1d, v17.1d pmull v0.1q, v16.1d, v17.1d pmull v1.1q, v16.1d, v17.1d pmull v2.1q, v16.1d, v17.1d pmull 
v3.1q, v16.1d, v17.1d pmull v4.1q, v16.1d, v17.1d pmull v0.1q, v16.1d, v17.1d pmull v1.1q, v16.1d, v17.1d pmull v2.1q, v16.1d, v17.1d pmull v3.1q, v16.1d, v17.1d pmull v4.1q, v16.1d, v17.1d sub x0, x0, x14 cbnz x0, pmulltest_loop add sp, sp, #0x50 ret _mixpmulladd128test: mixpmulladd128test: sub sp, sp, #0x50 ldr q16, [x1] ldr q17, [x1] ldr q18, [x1] ldr q19, [x1] ldr q20, [x1] ldr q21, [x1] mov x14, 20 mixpmulladd128test_loop: pmull v0.1q, v16.1d, v17.1d add v5.4s, v9.4s, v16.4s pmull v1.1q, v16.1d, v17.1d add v6.4s, v9.4s, v16.4s pmull v2.1q, v16.1d, v17.1d add v7.4s, v9.4s, v16.4s pmull v3.1q, v16.1d, v17.1d add v31.4s, v9.4s, v16.4s pmull v4.1q, v16.1d, v17.1d add v30.4s, v9.4s, v16.4s pmull v0.1q, v16.1d, v17.1d add v5.4s, v9.4s, v16.4s pmull v1.1q, v16.1d, v17.1d add v6.4s, v9.4s, v16.4s pmull v2.1q, v16.1d, v17.1d add v7.4s, v9.4s, v16.4s pmull v3.1q, v16.1d, v17.1d add v31.4s, v9.4s, v16.4s pmull v4.1q, v16.1d, v17.1d add v30.4s, v9.4s, v16.4s sub x0, x0, x14 cbnz x0, mixpmulladd128test_loop add sp, sp, #0x50 ret ================================================ FILE: InstructionRate/riscv_instructionrate.c ================================================ #define _GNU_SOURCE #include #include #include #include #include #include #include #include float measureFunction(uint64_t iterations, float clockSpeedGhz, void *arr, uint64_t (*testfunc)(uint64_t, void *)); extern uint64_t clktest(uint64_t iterations, void *data); extern uint64_t addtest(uint64_t iterations, void *data); extern uint64_t faddtest(uint64_t iterations, void *data); extern uint64_t fmultest(uint64_t iterations, void *data); extern uint64_t mixfaddfmultest(uint64_t iterations, void *data); extern uint64_t fmatest(uint64_t iterations, void *data); extern uint64_t faddlattest(uint64_t iterations, void *data); extern uint64_t fmullattest(uint64_t iterations, void *data); extern uint64_t fmalattest(uint64_t iterations, void *data); float fpTestArr[4] __attribute__ ((aligned (64))) = { 0.2, 1.5, 
2.7, 3.14 }; int intTestArr[4] __attribute__ ((aligned (64))) = { 1, 2, 3, 4 }; int sinkArr[4] __attribute__ ((aligned (64))) = { 2, 3, 4, 5 }; int main(int argc, char *argv[]) { struct timeval startTv, endTv; struct timezone startTz, endTz; uint64_t iterations = 1500000000; uint64_t iterationsHigh = iterations * 5; uint64_t time_diff_ms; float latency, opsPerNs, clockSpeedGhz; if (argc > 1) { for (int argIdx = 1; argIdx < argc; argIdx++) { if (*(argv[argIdx]) == '-') { char *arg = argv[argIdx] + 1; if (strncmp(arg, "iter", 4) == 0) { argIdx++; int iterMul = atoi(argv[argIdx]); iterations *= iterMul; iterationsHigh *= iterMul; fprintf(stderr, "Scaled iterations by %d\n", iterMul); } } } } gettimeofday(&startTv, &startTz); clktest(iterations, NULL); gettimeofday(&endTv, &endTz); time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); latency = 1e6 * (float)time_diff_ms / (float)iterations; // clk speed should be 1/latency, assuming we got one add per clk, roughly clockSpeedGhz = 1/latency; printf("Estimated clock speed> %.2f GHz\n", clockSpeedGhz); // integer side printf("Adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, NULL, addtest)); // FP printf("FP32 Adds per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, faddtest)); printf("FP32 Add latency> %.2f cycles\n", 1 / measureFunction(iterations, clockSpeedGhz, fpTestArr, faddlattest)); printf("FP32 Multiplies per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, fmultest)); printf("FP32 Multiply latency> %.2f cycles\n", 1 / measureFunction(iterations, clockSpeedGhz, fpTestArr, fmullattest)); printf("1:1 FP32 Add:Mul per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, mixfaddfmultest)); printf("FP32 FMA per clk> %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fpTestArr, fmatest)); printf("FP32 FMA latency> %.2f cycles\n", 1 / measureFunction(iterations, clockSpeedGhz, 
fpTestArr, fmalattest)); return 0; } float measureFunction(uint64_t iterations, float clockSpeedGhz, void *arr, uint64_t (*testfunc)(uint64_t, void *)) { struct timeval startTv, endTv; struct timezone startTz, endTz; uint64_t time_diff_ms, retval; float latency, opsPerNs; gettimeofday(&startTv, &startTz); retval = testfunc(iterations, arr); gettimeofday(&endTv, &endTz); time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); latency = 1e6 * (float)time_diff_ms / (float)iterations; opsPerNs = 1/latency; //printf("return value: %lu\n", retval); return opsPerNs / clockSpeedGhz; } ================================================ FILE: InstructionRate/riscv_instructionrate.s ================================================ .text .global clktest .global addtest .global faddtest .global fmultest .global mixfaddfmultest .global fmatest .global faddlattest .global fmullattest .global fmalattest /* a0 = iterations, a1 = data arr */ clktest: mv t0, x0 mv t1, x0 addi t1, t1, 1 clktest_loop: add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 add t0, t0, t1 addi a0, a0, -20 blt x0, a0, clktest_loop ret addtest: mv t0, x0 addi t0, t0, 1 mv t1, t0 mv t2, t0 mv t3, t0 mv t4, t0 mv t5, t0 mv t6, t0 addtest_loop: add t1, t1, t6 add t2, t2, t6 add t3, t3, t6 add t4, t4, t6 add t5, t5, t6 add t1, t1, t6 add t2, t2, t6 add t3, t3, t6 add t4, t4, t6 add t5, t5, t6 add t1, t1, t6 add t2, t2, t6 add t3, t3, t6 add t4, t4, t6 add t5, t5, t6 add t1, t1, t6 add t2, t2, t6 add t3, t3, t6 add t4, t4, t6 add t5, t5, t6 addi a0, a0, -20 blt x0, a0, addtest_loop ret /* f0-7 are fp temporaries */ faddtest: flw f0, (a1) flw f1, 4(a1) flw f2, 8(a1) flw f3, 12(a1) fsub.d f4, f4, f4 fsub.d f5, f5, f5 fsub.d f6, f6, f6 fsub.d f7, 
f7, f7
    /* f4-f7 were just subtracted from themselves; add f0 into each */
    fadd.d f4, f4, f0
    fadd.d f5, f5, f0
    fadd.d f6, f6, f0
    fadd.d f7, f7, f0
/* Throughput loop: seven independent accumulators (f1-f7), each taking f0
 * added to it twice per pass -> 14 adds, matching the -14 step below. */
faddtest_loop:
    fadd.d f1, f1, f0
    fadd.d f2, f2, f0
    fadd.d f3, f3, f0
    fadd.d f4, f4, f0
    fadd.d f5, f5, f0
    fadd.d f6, f6, f0
    fadd.d f7, f7, f0
    fadd.d f1, f1, f0
    fadd.d f2, f2, f0
    fadd.d f3, f3, f0
    fadd.d f4, f4, f0
    fadd.d f5, f5, f0
    fadd.d f6, f6, f0
    fadd.d f7, f7, f0
    addi a0, a0, -14
    blt x0, a0, faddtest_loop       /* keep looping while a0 > 0 (signed) */
    ret

/*
 * faddlattest: floating-point add latency.
 *   a0 = iteration count, reduced by 14 per pass; loops while a0 > 0.
 *   a1 = address of four 32-bit values, loaded into f0-f3 with flw.
 * Every add in the loop reads and writes f1, so each one depends on the
 * previous result. f0 and f2-f7 are set up but not used by the loop.
 * NOTE(review): the operands are loaded with flw (single precision) but all
 * arithmetic uses the .d (double precision) forms, while the C driver in
 * riscv_instructionrate.c labels these results "FP32" -- confirm whether the
 * .s forms (or fld loads) were intended.
 */
faddlattest:
    flw f0, (a1)
    flw f1, 4(a1)
    flw f2, 8(a1)
    flw f3, 12(a1)
    fsub.d f4, f4, f4
    fsub.d f5, f5, f5
    fsub.d f6, f6, f6
    fsub.d f7, f7, f7
    fadd.d f4, f4, f0
    fadd.d f5, f5, f0
    fadd.d f6, f6, f0
    fadd.d f7, f7, f0
faddlattest_loop:
    fadd.d f1, f1, f1
    fadd.d f1, f1, f1
    fadd.d f1, f1, f1
    fadd.d f1, f1, f1
    fadd.d f1, f1, f1
    fadd.d f1, f1, f1
    fadd.d f1, f1, f1
    fadd.d f1, f1, f1
    fadd.d f1, f1, f1
    fadd.d f1, f1, f1
    fadd.d f1, f1, f1
    fadd.d f1, f1, f1
    fadd.d f1, f1, f1
    fadd.d f1, f1, f1
    addi a0, a0, -14
    blt x0, a0, faddlattest_loop
    ret

/*
 * fmultest: floating-point multiply throughput.
 *   a0 = iteration count, reduced by 14 per pass; loops while a0 > 0.
 *   a1 = address of four 32-bit values, loaded into f0-f3 with flw.
 * Seven independent accumulators (f1-f7) are each multiplied by f0 twice
 * per pass.
 * NOTE(review): same flw / .d precision mismatch as the other FP tests in
 * this file -- confirm the intended precision.
 */
fmultest:
    flw f0, (a1)
    flw f1, 4(a1)
    flw f2, 8(a1)
    flw f3, 12(a1)
    fsub.d f4, f4, f4
    fsub.d f5, f5, f5
    fsub.d f6, f6, f6
    fsub.d f7, f7, f7
    fadd.d f4, f4, f0
    fadd.d f5, f5, f0
    fadd.d f6, f6, f0
    fadd.d f7, f7, f0
fmultest_loop:
    fmul.d f1, f1, f0
    fmul.d f2, f2, f0
    fmul.d f3, f3, f0
    fmul.d f4, f4, f0
    fmul.d f5, f5, f0
    fmul.d f6, f6, f0
    fmul.d f7, f7, f0
    fmul.d f1, f1, f0
    fmul.d f2, f2, f0
    fmul.d f3, f3, f0
    fmul.d f4, f4, f0
    fmul.d f5, f5, f0
    fmul.d f6, f6, f0
    fmul.d f7, f7, f0
    addi a0, a0, -14
    blt x0, a0, fmultest_loop
    ret

/*
 * fmullattest: floating-point multiply latency.
 *   a0 = iteration count, reduced by 14 per pass; loops while a0 > 0.
 *   a1 = address of four 32-bit values, loaded into f0-f3 with flw.
 * Every multiply in the loop reads and writes f1, forming a single
 * dependency chain. f0 and f2-f7 are set up but not used by the loop.
 * NOTE(review): same flw / .d precision mismatch as the other FP tests in
 * this file -- confirm the intended precision.
 */
fmullattest:
    flw f0, (a1)
    flw f1, 4(a1)
    flw f2, 8(a1)
    flw f3, 12(a1)
    fsub.d f4, f4, f4
    fsub.d f5, f5, f5
    fsub.d f6, f6, f6
    fsub.d f7, f7, f7
    fadd.d f4, f4, f0
    fadd.d f5, f5, f0
    fadd.d f6, f6, f0
    fadd.d f7, f7, f0
fmullattest_loop:
    fmul.d f1, f1, f1
    fmul.d f1, f1, f1
    fmul.d f1, f1, f1
    fmul.d f1, f1, f1
    fmul.d f1, f1, f1
    fmul.d f1, f1, f1
    fmul.d f1, f1, f1
    fmul.d f1, f1, f1
    fmul.d f1, f1, f1
    fmul.d f1, f1, f1
    fmul.d f1, f1, f1
    fmul.d f1, f1, f1
    fmul.d f1, f1, f1
    fmul.d f1, f1, f1
    addi a0, a0, -14
    blt x0, a0, fmullattest_loop
    ret

mixfaddfmultest:
flw f0, (a1) flw f1, 4(a1) flw f2, 8(a1) flw f3, 12(a1) fsub.d f4, f4, f4 fsub.d f5, f5, f5 fsub.d f6, f6, f6 fsub.d f7, f7, f7 fadd.d f4, f4, f0 fadd.d f5, f5, f0 fadd.d f6, f6, f0 fadd.d f7, f7, f0 mixfaddfmultest_loop: fadd.d f1, f1, f0 fmul.d f2, f2, f0 fadd.d f3, f3, f0 fmul.d f4, f4, f0 fadd.d f5, f5, f0 fmul.d f6, f6, f0 fadd.d f7, f7, f0 fmul.d f1, f1, f0 fadd.d f2, f2, f0 fmul.d f3, f3, f0 fadd.d f4, f4, f0 fmul.d f5, f5, f0 fadd.d f6, f6, f0 fmul.d f7, f7, f0 addi a0, a0, -14 blt x0, a0, mixfaddfmultest_loop ret fmatest: flw f0, (a1) flw f1, 4(a1) flw f2, 8(a1) flw f3, 12(a1) fsub.d f4, f4, f4 fsub.d f5, f5, f5 fsub.d f6, f6, f6 fsub.d f7, f7, f7 fadd.d f4, f4, f0 fadd.d f5, f5, f0 fadd.d f6, f6, f0 fadd.d f7, f7, f0 fmatest_loop: fmadd.d f1, f1, f1, f0 fmadd.d f2, f2, f2, f0 fmadd.d f3, f3, f3, f0 fmadd.d f4, f4, f4, f0 fmadd.d f5, f5, f5, f0 fmadd.d f6, f6, f6, f0 fmadd.d f7, f7, f7, f0 fmadd.d f1, f1, f1, f0 fmadd.d f2, f2, f2, f0 fmadd.d f3, f3, f3, f0 fmadd.d f4, f4, f4, f0 fmadd.d f5, f5, f5, f0 fmadd.d f6, f6, f6, f0 fmadd.d f7, f7, f7, f0 addi a0, a0, -14 blt x0, a0, fmatest_loop ret fmalattest: flw f0, (a1) flw f1, 4(a1) flw f2, 8(a1) flw f3, 12(a1) fsub.d f4, f4, f4 fsub.d f5, f5, f5 fsub.d f6, f6, f6 fsub.d f7, f7, f7 fadd.d f4, f4, f0 fadd.d f5, f5, f0 fadd.d f6, f6, f0 fadd.d f7, f7, f0 fmalattest_loop: fmadd.d f1, f1, f1, f0 fmadd.d f1, f1, f1, f0 fmadd.d f1, f1, f1, f0 fmadd.d f1, f1, f1, f0 fmadd.d f1, f1, f1, f0 fmadd.d f1, f1, f1, f0 fmadd.d f1, f1, f1, f0 fmadd.d f1, f1, f1, f0 fmadd.d f1, f1, f1, f0 fmadd.d f1, f1, f1, f0 fmadd.d f1, f1, f1, f0 fmadd.d f1, f1, f1, f0 fmadd.d f1, f1, f1, f0 fmadd.d f1, f1, f1, f0 addi a0, a0, -14 blt x0, a0, fmalattest_loop ret ================================================ FILE: InstructionRate/test.s ================================================ x86_instructionrate: file format elf64-x86-64 Disassembly of section .init: 0000000000001000 <_init>: 1000: f3 0f 1e fa endbr64 1004: 48 83 ec 08 sub 
$0x8,%rsp 1008: 48 8b 05 d9 cf 00 00 mov 0xcfd9(%rip),%rax # dfe8 <__gmon_start__> 100f: 48 85 c0 test %rax,%rax 1012: 74 02 je 1016 <_init+0x16> 1014: ff d0 callq *%rax 1016: 48 83 c4 08 add $0x8,%rsp 101a: c3 retq Disassembly of section .plt: 0000000000001020 <.plt>: 1020: ff 35 62 cf 00 00 pushq 0xcf62(%rip) # df88 <_GLOBAL_OFFSET_TABLE_+0x8> 1026: ff 25 64 cf 00 00 jmpq *0xcf64(%rip) # df90 <_GLOBAL_OFFSET_TABLE_+0x10> 102c: 0f 1f 40 00 nopl 0x0(%rax) 0000000000001030 : 1030: ff 25 62 cf 00 00 jmpq *0xcf62(%rip) # df98 1036: 68 00 00 00 00 pushq $0x0 103b: e9 e0 ff ff ff jmpq 1020 <.plt> 0000000000001040 <__stack_chk_fail@plt>: 1040: ff 25 5a cf 00 00 jmpq *0xcf5a(%rip) # dfa0 <__stack_chk_fail@GLIBC_2.4> 1046: 68 01 00 00 00 pushq $0x1 104b: e9 d0 ff ff ff jmpq 1020 <.plt> 0000000000001050 : 1050: ff 25 52 cf 00 00 jmpq *0xcf52(%rip) # dfa8 1056: 68 02 00 00 00 pushq $0x2 105b: e9 c0 ff ff ff jmpq 1020 <.plt> 0000000000001060 : 1060: ff 25 4a cf 00 00 jmpq *0xcf4a(%rip) # dfb0 1066: 68 03 00 00 00 pushq $0x3 106b: e9 b0 ff ff ff jmpq 1020 <.plt> 0000000000001070 : 1070: ff 25 42 cf 00 00 jmpq *0xcf42(%rip) # dfb8 1076: 68 04 00 00 00 pushq $0x4 107b: e9 a0 ff ff ff jmpq 1020 <.plt> 0000000000001080 <__printf_chk@plt>: 1080: ff 25 3a cf 00 00 jmpq *0xcf3a(%rip) # dfc0 <__printf_chk@GLIBC_2.3.4> 1086: 68 05 00 00 00 pushq $0x5 108b: e9 90 ff ff ff jmpq 1020 <.plt> 0000000000001090 : 1090: ff 25 32 cf 00 00 jmpq *0xcf32(%rip) # dfc8 1096: 68 06 00 00 00 pushq $0x6 109b: e9 80 ff ff ff jmpq 1020 <.plt> 00000000000010a0 : 10a0: ff 25 2a cf 00 00 jmpq *0xcf2a(%rip) # dfd0 10a6: 68 07 00 00 00 pushq $0x7 10ab: e9 70 ff ff ff jmpq 1020 <.plt> Disassembly of section .plt.got: 00000000000010b0 <__cxa_finalize@plt>: 10b0: ff 25 42 cf 00 00 jmpq *0xcf42(%rip) # dff8 <__cxa_finalize@GLIBC_2.2.5> 10b6: 66 90 xchg %ax,%ax Disassembly of section .text: 00000000000010c0
: 10c0: f3 0f 1e fa endbr64 10c4: 41 57 push %r15 10c6: 41 56 push %r14 10c8: 41 55 push %r13 10ca: 41 54 push %r12 10cc: 41 89 fc mov %edi,%r12d 10cf: bf 40 00 00 00 mov $0x40,%edi 10d4: 55 push %rbp 10d5: 48 89 f5 mov %rsi,%rbp 10d8: be 00 10 00 00 mov $0x1000,%esi 10dd: 53 push %rbx 10de: 48 83 ec 58 sub $0x58,%rsp 10e2: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax 10e9: 00 00 10eb: 48 89 44 24 48 mov %rax,0x48(%rsp) 10f0: 31 c0 xor %eax,%eax 10f2: e8 a9 ff ff ff callq 10a0 10f7: 66 0f 6f 0d 91 af 00 movdqa 0xaf91(%rip),%xmm1 # c090 <_IO_stdin_used+0x1090> 10fe: 00 10ff: 66 0f 6f 25 99 af 00 movdqa 0xaf99(%rip),%xmm4 # c0a0 <_IO_stdin_used+0x10a0> 1106: 00 1107: 48 89 05 f2 cf 00 00 mov %rax,0xcff2(%rip) # e100 110e: 66 0f 6f 1d 9a af 00 movdqa 0xaf9a(%rip),%xmm3 # c0b0 <_IO_stdin_used+0x10b0> 1115: 00 1116: 48 8d 90 00 10 00 00 lea 0x1000(%rax),%rdx 111d: 0f 1f 00 nopl (%rax) 1120: 66 0f 6f c1 movdqa %xmm1,%xmm0 1124: 48 83 c0 10 add $0x10,%rax 1128: 66 0f d4 cc paddq %xmm4,%xmm1 112c: 66 0f 6f d0 movdqa %xmm0,%xmm2 1130: 66 0f d4 d3 paddq %xmm3,%xmm2 1134: 0f c6 c2 88 shufps $0x88,%xmm2,%xmm0 1138: 0f 29 40 f0 movaps %xmm0,-0x10(%rax) 113c: 48 39 c2 cmp %rax,%rdx 113f: 75 df jne 1120 1141: 49 be 00 eb 08 bf 01 movabs $0x1bf08eb00,%r14 1148: 00 00 00 114b: 41 83 fc 02 cmp $0x2,%r12d 114f: 0f 8f db 35 00 00 jg 4730 1155: 4c 8d 2d 94 cf 00 00 lea 0xcf94(%rip),%r13 # e0f0 <__cpu_model> 115c: 41 f6 45 0d 02 testb $0x2,0xd(%r13) 1161: 0f 85 a7 35 00 00 jne 470e 1167: 41 f6 45 0d 04 testb $0x4,0xd(%r13) 116c: 0f 85 7a 35 00 00 jne 46ec 1172: 41 f6 45 0e 02 testb $0x2,0xe(%r13) 1177: 0f 85 4d 35 00 00 jne 46ca 117d: b8 07 00 00 00 mov $0x7,%eax 1182: 31 c9 xor %ecx,%ecx 1184: 0f a2 cpuid 1186: 81 e3 00 00 01 00 and $0x10000,%ebx 118c: 0f 85 af 1f 00 00 jne 3141 1192: 41 83 fc 01 cmp $0x1,%r12d 1196: 0f 84 9c 47 00 00 je 5938 119c: f2 0f 10 05 dc ae 00 movsd 0xaedc(%rip),%xmm0 # c080 <_IO_stdin_used+0x1080> 11a3: 00 11a4: bf 01 00 00 00 mov $0x1,%edi 11a9: b8 01 00 00 00 mov 
$0x1,%eax 11ae: 48 8d 35 c3 a5 00 00 lea 0xa5c3(%rip),%rsi # b778 <_IO_stdin_used+0x778> 11b5: e8 c6 fe ff ff callq 1080 <__printf_chk@plt> 11ba: f3 0f 10 35 c6 ae 00 movss 0xaec6(%rip),%xmm6 # c088 <_IO_stdin_used+0x1088> 11c1: 00 11c2: f3 0f 11 74 24 0c movss %xmm6,0xc(%rsp) 11c8: f3 0f 11 74 24 08 movss %xmm6,0x8(%rsp) 11ce: 41 83 fc 01 cmp $0x1,%r12d 11d2: 0f 8e 6e 17 00 00 jle 2946 11d8: 4c 8b 6d 08 mov 0x8(%rbp),%r13 11dc: ba 05 00 00 00 mov $0x5,%edx 11e1: 48 8d 35 86 a5 00 00 lea 0xa586(%rip),%rsi # b76e <_IO_stdin_used+0x76e> 11e8: 4c 89 ef mov %r13,%rdi 11eb: e8 40 fe ff ff callq 1030 11f0: 85 c0 test %eax,%eax 11f2: 0f 85 a3 17 00 00 jne 299b 11f8: 48 8d 35 79 5b 00 00 lea 0x5b79(%rip),%rsi # 6d78 11ff: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1205: 4c 89 f7 mov %r14,%rdi 1208: e8 33 98 00 00 callq aa40 120d: bf 01 00 00 00 mov $0x1,%edi 1212: b8 01 00 00 00 mov $0x1,%eax 1217: 48 8d 35 2e a5 00 00 lea 0xa52e(%rip),%rsi # b74c <_IO_stdin_used+0x74c> 121e: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1222: e8 59 fe ff ff callq 1080 <__printf_chk@plt> 1227: 4c 8b 6d 08 mov 0x8(%rbp),%r13 122b: ba 05 00 00 00 mov $0x5,%edx 1230: 48 8d 35 8e 9f 00 00 lea 0x9f8e(%rip),%rsi # b1c5 <_IO_stdin_used+0x1c5> 1237: 4c 89 ef mov %r13,%rdi 123a: e8 f1 fd ff ff callq 1030 123f: 85 c0 test %eax,%eax 1241: 0f 85 70 17 00 00 jne 29b7 1247: 48 8d 35 f1 5a 00 00 lea 0x5af1(%rip),%rsi # 6d3f 124e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1254: 4c 89 f7 mov %r14,%rdi 1257: e8 e4 97 00 00 callq aa40 125c: bf 01 00 00 00 mov $0x1,%edi 1261: b8 01 00 00 00 mov $0x1,%eax 1266: 48 8d 35 c4 a4 00 00 lea 0xa4c4(%rip),%rsi # b731 <_IO_stdin_used+0x731> 126d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1271: e8 0a fe ff ff callq 1080 <__printf_chk@plt> 1276: 4c 8b 6d 08 mov 0x8(%rbp),%r13 127a: ba 03 00 00 00 mov $0x3,%edx 127f: 48 8d 35 89 9f 00 00 lea 0x9f89(%rip),%rsi # b20f <_IO_stdin_used+0x20f> 1286: 4c 89 ef mov %r13,%rdi 1289: e8 a2 fd ff ff callq 1030 128e: 85 c0 test %eax,%eax 1290: 0f 85 3d 17 00 
00 jne 29d3 1296: 48 8d 35 01 5b 00 00 lea 0x5b01(%rip),%rsi # 6d9e 129d: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 12a3: 4c 89 f7 mov %r14,%rdi 12a6: e8 95 97 00 00 callq aa40 12ab: bf 01 00 00 00 mov $0x1,%edi 12b0: b8 01 00 00 00 mov $0x1,%eax 12b5: 48 8d 35 61 a4 00 00 lea 0xa461(%rip),%rsi # b71d <_IO_stdin_used+0x71d> 12bc: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 12c0: e8 bb fd ff ff callq 1080 <__printf_chk@plt> 12c5: 4c 8b 6d 08 mov 0x8(%rbp),%r13 12c9: 48 8d 35 fb 9e 00 00 lea 0x9efb(%rip),%rsi # b1cb <_IO_stdin_used+0x1cb> 12d0: 4c 89 ef mov %r13,%rdi 12d3: e8 88 fd ff ff callq 1060 12d8: 85 c0 test %eax,%eax 12da: 0f 85 0a 17 00 00 jne 29ea 12e0: 4c 8d 3d 43 5b 00 00 lea 0x5b43(%rip),%r15 # 6e2a 12e7: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 12ed: 4c 89 f7 mov %r14,%rdi 12f0: 4c 89 fe mov %r15,%rsi 12f3: e8 48 97 00 00 callq aa40 12f8: bf 01 00 00 00 mov $0x1,%edi 12fd: b8 01 00 00 00 mov $0x1,%eax 1302: 48 8d 35 f7 a3 00 00 lea 0xa3f7(%rip),%rsi # b700 <_IO_stdin_used+0x700> 1309: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 130d: e8 6e fd ff ff callq 1080 <__printf_chk@plt> 1312: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1316: 48 8d 35 b5 9e 00 00 lea 0x9eb5(%rip),%rsi # b1d2 <_IO_stdin_used+0x1d2> 131d: 4c 89 ef mov %r13,%rdi 1320: e8 3b fd ff ff callq 1060 1325: 85 c0 test %eax,%eax 1327: 0f 85 d4 16 00 00 jne 2a01 132d: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1333: 4c 89 fe mov %r15,%rsi 1336: 4c 89 f7 mov %r14,%rdi 1339: e8 02 97 00 00 callq aa40 133e: bf 01 00 00 00 mov $0x1,%edi 1343: b8 01 00 00 00 mov $0x1,%eax 1348: 48 8d 35 94 a3 00 00 lea 0xa394(%rip),%rsi # b6e3 <_IO_stdin_used+0x6e3> 134f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1353: e8 28 fd ff ff callq 1080 <__printf_chk@plt> 1358: 4c 8b 6d 08 mov 0x8(%rbp),%r13 135c: ba 06 00 00 00 mov $0x6,%edx 1361: 48 8d 35 73 9e 00 00 lea 0x9e73(%rip),%rsi # b1db <_IO_stdin_used+0x1db> 1368: 4c 89 ef mov %r13,%rdi 136b: e8 c0 fc ff ff callq 1030 1370: 85 c0 test %eax,%eax 1372: 0f 85 a5 16 00 00 jne 2a1d 1378: 48 8d 35 c8 91 00 00 lea 
0x91c8(%rip),%rsi # a547 137f: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1385: 4c 89 f7 mov %r14,%rdi 1388: e8 b3 96 00 00 callq aa40 138d: bf 01 00 00 00 mov $0x1,%edi 1392: b8 01 00 00 00 mov $0x1,%eax 1397: 48 8d 35 27 a3 00 00 lea 0xa327(%rip),%rsi # b6c5 <_IO_stdin_used+0x6c5> 139e: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 13a2: e8 d9 fc ff ff callq 1080 <__printf_chk@plt> 13a7: 4c 8b 6d 08 mov 0x8(%rbp),%r13 13ab: ba 08 00 00 00 mov $0x8,%edx 13b0: 48 8d 35 22 9e 00 00 lea 0x9e22(%rip),%rsi # b1d9 <_IO_stdin_used+0x1d9> 13b7: 4c 89 ef mov %r13,%rdi 13ba: e8 71 fc ff ff callq 1030 13bf: 85 c0 test %eax,%eax 13c1: 0f 85 72 16 00 00 jne 2a39 13c7: 48 8d 35 ee 91 00 00 lea 0x91ee(%rip),%rsi # a5bc 13ce: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 13d4: 4c 89 f7 mov %r14,%rdi 13d7: e8 64 96 00 00 callq aa40 13dc: bf 01 00 00 00 mov $0x1,%edi 13e1: b8 01 00 00 00 mov $0x1,%eax 13e6: 48 8d 35 3b ac 00 00 lea 0xac3b(%rip),%rsi # c028 <_IO_stdin_used+0x1028> 13ed: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 13f1: e8 8a fc ff ff callq 1080 <__printf_chk@plt> 13f6: 4c 8b 6d 08 mov 0x8(%rbp),%r13 13fa: ba 07 00 00 00 mov $0x7,%edx 13ff: 48 8d 35 dc 9d 00 00 lea 0x9ddc(%rip),%rsi # b1e2 <_IO_stdin_used+0x1e2> 1406: 4c 89 ef mov %r13,%rdi 1409: e8 22 fc ff ff callq 1030 140e: 85 c0 test %eax,%eax 1410: 0f 85 3f 16 00 00 jne 2a55 1416: 48 8d 35 e1 92 00 00 lea 0x92e1(%rip),%rsi # a6fe 141d: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1423: 4c 89 f7 mov %r14,%rdi 1426: e8 15 96 00 00 callq aa40 142b: bf 01 00 00 00 mov $0x1,%edi 1430: b8 01 00 00 00 mov $0x1,%eax 1435: 48 8d 35 71 a2 00 00 lea 0xa271(%rip),%rsi # b6ad <_IO_stdin_used+0x6ad> 143c: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1440: e8 3b fc ff ff callq 1080 <__printf_chk@plt> 1445: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1449: ba 07 00 00 00 mov $0x7,%edx 144e: 48 8d 35 95 9d 00 00 lea 0x9d95(%rip),%rsi # b1ea <_IO_stdin_used+0x1ea> 1455: 4c 89 ef mov %r13,%rdi 1458: e8 d3 fb ff ff callq 1030 145d: 85 c0 test %eax,%eax 145f: 0f 85 0c 16 00 00 jne 2a71 1465: 
48 8d 35 c7 91 00 00 lea 0x91c7(%rip),%rsi # a633 146c: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1472: 4c 89 f7 mov %r14,%rdi 1475: e8 c6 95 00 00 callq aa40 147a: bf 01 00 00 00 mov $0x1,%edi 147f: b8 01 00 00 00 mov $0x1,%eax 1484: 48 8d 35 0a a2 00 00 lea 0xa20a(%rip),%rsi # b695 <_IO_stdin_used+0x695> 148b: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 148f: e8 ec fb ff ff callq 1080 <__printf_chk@plt> 1494: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1498: ba 07 00 00 00 mov $0x7,%edx 149d: 48 8d 35 4e 9d 00 00 lea 0x9d4e(%rip),%rsi # b1f2 <_IO_stdin_used+0x1f2> 14a4: 4c 89 ef mov %r13,%rdi 14a7: e8 84 fb ff ff callq 1030 14ac: 85 c0 test %eax,%eax 14ae: 0f 85 d9 15 00 00 jne 2a8d 14b4: 48 8d 35 ba 92 00 00 lea 0x92ba(%rip),%rsi # a775 14bb: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 14c1: 4c 89 f7 mov %r14,%rdi 14c4: e8 77 95 00 00 callq aa40 14c9: bf 01 00 00 00 mov $0x1,%edi 14ce: b8 01 00 00 00 mov $0x1,%eax 14d3: 48 8d 35 a3 a1 00 00 lea 0xa1a3(%rip),%rsi # b67d <_IO_stdin_used+0x67d> 14da: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 14de: e8 9d fb ff ff callq 1080 <__printf_chk@plt> 14e3: 4c 8b 6d 08 mov 0x8(%rbp),%r13 14e7: ba 06 00 00 00 mov $0x6,%edx 14ec: 48 8d 35 07 9d 00 00 lea 0x9d07(%rip),%rsi # b1fa <_IO_stdin_used+0x1fa> 14f3: 4c 89 ef mov %r13,%rdi 14f6: e8 35 fb ff ff callq 1030 14fb: 85 c0 test %eax,%eax 14fd: 0f 85 a6 15 00 00 jne 2aa9 1503: 48 8d 35 6d 93 00 00 lea 0x936d(%rip),%rsi # a877 150a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1510: 4c 89 f7 mov %r14,%rdi 1513: e8 28 95 00 00 callq aa40 1518: bf 01 00 00 00 mov $0x1,%edi 151d: b8 01 00 00 00 mov $0x1,%eax 1522: 48 8d 35 3d a1 00 00 lea 0xa13d(%rip),%rsi # b666 <_IO_stdin_used+0x666> 1529: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 152d: e8 4e fb ff ff callq 1080 <__printf_chk@plt> 1532: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1536: ba 06 00 00 00 mov $0x6,%edx 153b: 48 8d 35 bf 9c 00 00 lea 0x9cbf(%rip),%rsi # b201 <_IO_stdin_used+0x201> 1542: 4c 89 ef mov %r13,%rdi 1545: e8 e6 fa ff ff callq 1030 154a: 85 c0 test %eax,%eax 154c: 0f 85 73 
15 00 00 jne 2ac5 1552: 48 8d 35 95 93 00 00 lea 0x9395(%rip),%rsi # a8ee 1559: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 155f: 4c 89 f7 mov %r14,%rdi 1562: e8 d9 94 00 00 callq aa40 1567: bf 01 00 00 00 mov $0x1,%edi 156c: b8 01 00 00 00 mov $0x1,%eax 1571: 48 8d 35 d7 a0 00 00 lea 0xa0d7(%rip),%rsi # b64f <_IO_stdin_used+0x64f> 1578: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 157c: e8 ff fa ff ff callq 1080 <__printf_chk@plt> 1581: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1585: ba 06 00 00 00 mov $0x6,%edx 158a: 48 8d 35 70 9c 00 00 lea 0x9c70(%rip),%rsi # b201 <_IO_stdin_used+0x201> 1591: 4c 89 ef mov %r13,%rdi 1594: e8 97 fa ff ff callq 1030 1599: 85 c0 test %eax,%eax 159b: 75 33 jne 15d0 159d: 48 8d 35 48 92 00 00 lea 0x9248(%rip),%rsi # a7ec 15a4: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 15aa: 4c 89 f7 mov %r14,%rdi 15ad: e8 8e 94 00 00 callq aa40 15b2: bf 01 00 00 00 mov $0x1,%edi 15b7: b8 01 00 00 00 mov $0x1,%eax 15bc: 48 8d 35 3d aa 00 00 lea 0xaa3d(%rip),%rsi # c000 <_IO_stdin_used+0x1000> 15c3: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 15c7: e8 b4 fa ff ff callq 1080 <__printf_chk@plt> 15cc: 4c 8b 6d 08 mov 0x8(%rbp),%r13 15d0: ba 06 00 00 00 mov $0x6,%edx 15d5: 48 8d 35 8b a1 00 00 lea 0xa18b(%rip),%rsi # b767 <_IO_stdin_used+0x767> 15dc: 4c 89 ef mov %r13,%rdi 15df: e8 4c fa ff ff callq 1030 15e4: 85 c0 test %eax,%eax 15e6: 0f 85 f5 14 00 00 jne 2ae1 15ec: 48 8d 35 b3 56 00 00 lea 0x56b3(%rip),%rsi # 6ca6 15f3: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 15f9: 4c 89 f7 mov %r14,%rdi 15fc: e8 3f 94 00 00 callq aa40 1601: bf 01 00 00 00 mov $0x1,%edi 1606: b8 01 00 00 00 mov $0x1,%eax 160b: 48 8d 35 c6 a9 00 00 lea 0xa9c6(%rip),%rsi # bfd8 <_IO_stdin_used+0xfd8> 1612: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1616: e8 65 fa ff ff callq 1080 <__printf_chk@plt> 161b: 4c 8b 6d 08 mov 0x8(%rbp),%r13 161f: ba 0a 00 00 00 mov $0xa,%edx 1624: 48 8d 35 dd 9b 00 00 lea 0x9bdd(%rip),%rsi # b208 <_IO_stdin_used+0x208> 162b: 4c 89 ef mov %r13,%rdi 162e: e8 fd f9 ff ff callq 1030 1633: 85 c0 test %eax,%eax 
1635: 0f 85 c2 14 00 00 jne 2afd 163b: 48 8d 35 62 61 00 00 lea 0x6162(%rip),%rsi # 77a4 1642: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1648: 4c 89 f7 mov %r14,%rdi 164b: e8 f0 93 00 00 callq aa40 1650: bf 01 00 00 00 mov $0x1,%edi 1655: b8 01 00 00 00 mov $0x1,%eax 165a: 48 8d 35 d1 9f 00 00 lea 0x9fd1(%rip),%rsi # b632 <_IO_stdin_used+0x632> 1661: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1665: e8 16 fa ff ff callq 1080 <__printf_chk@plt> 166a: 4c 8b 6d 08 mov 0x8(%rbp),%r13 166e: ba 06 00 00 00 mov $0x6,%edx 1673: 48 8d 35 99 9b 00 00 lea 0x9b99(%rip),%rsi # b213 <_IO_stdin_used+0x213> 167a: 4c 89 ef mov %r13,%rdi 167d: e8 ae f9 ff ff callq 1030 1682: 85 c0 test %eax,%eax 1684: 0f 85 8f 14 00 00 jne 2b19 168a: 48 8d 35 56 60 00 00 lea 0x6056(%rip),%rsi # 76e7 1691: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1697: 4c 89 f7 mov %r14,%rdi 169a: e8 a1 93 00 00 callq aa40 169f: bf 01 00 00 00 mov $0x1,%edi 16a4: b8 01 00 00 00 mov $0x1,%eax 16a9: 48 8d 35 67 9f 00 00 lea 0x9f67(%rip),%rsi # b617 <_IO_stdin_used+0x617> 16b0: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 16b4: e8 c7 f9 ff ff callq 1080 <__printf_chk@plt> 16b9: 4c 8b 6d 08 mov 0x8(%rbp),%r13 16bd: ba 03 00 00 00 mov $0x3,%edx 16c2: 48 8d 35 53 9b 00 00 lea 0x9b53(%rip),%rsi # b21c <_IO_stdin_used+0x21c> 16c9: 4c 89 ef mov %r13,%rdi 16cc: e8 5f f9 ff ff callq 1030 16d1: 85 c0 test %eax,%eax 16d3: 0f 85 5c 14 00 00 jne 2b35 16d9: 48 8d 35 11 5e 00 00 lea 0x5e11(%rip),%rsi # 74f1 16e0: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 16e6: 4c 89 f7 mov %r14,%rdi 16e9: e8 52 93 00 00 callq aa40 16ee: bf 01 00 00 00 mov $0x1,%edi 16f3: b8 01 00 00 00 mov $0x1,%eax 16f8: 48 8d 35 ff 9e 00 00 lea 0x9eff(%rip),%rsi # b5fe <_IO_stdin_used+0x5fe> 16ff: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1703: e8 78 f9 ff ff callq 1080 <__printf_chk@plt> 1708: 4c 8b 6d 08 mov 0x8(%rbp),%r13 170c: ba 05 00 00 00 mov $0x5,%edx 1711: 48 8d 35 02 9b 00 00 lea 0x9b02(%rip),%rsi # b21a <_IO_stdin_used+0x21a> 1718: 4c 89 ef mov %r13,%rdi 171b: e8 10 f9 ff ff callq 1030 
1720: 85 c0 test %eax,%eax 1722: 0f 85 29 14 00 00 jne 2b51 1728: 48 8d 35 9a 5e 00 00 lea 0x5e9a(%rip),%rsi # 75c9 172f: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1735: 4c 89 f7 mov %r14,%rdi 1738: e8 03 93 00 00 callq aa40 173d: bf 01 00 00 00 mov $0x1,%edi 1742: b8 01 00 00 00 mov $0x1,%eax 1747: 48 8d 35 9a 9e 00 00 lea 0x9e9a(%rip),%rsi # b5e8 <_IO_stdin_used+0x5e8> 174e: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1752: e8 29 f9 ff ff callq 1080 <__printf_chk@plt> 1757: 4c 8b 6d 08 mov 0x8(%rbp),%r13 175b: ba 04 00 00 00 mov $0x4,%edx 1760: 48 8d 35 b9 9a 00 00 lea 0x9ab9(%rip),%rsi # b220 <_IO_stdin_used+0x220> 1767: 4c 89 ef mov %r13,%rdi 176a: e8 c1 f8 ff ff callq 1030 176f: 85 c0 test %eax,%eax 1771: 0f 85 f6 13 00 00 jne 2b6d 1777: 48 8d 35 b2 8b 00 00 lea 0x8bb2(%rip),%rsi # a330 177e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1784: 4c 89 f7 mov %r14,%rdi 1787: e8 b4 92 00 00 callq aa40 178c: bf 01 00 00 00 mov $0x1,%edi 1791: b8 01 00 00 00 mov $0x1,%eax 1796: 48 8d 35 37 9e 00 00 lea 0x9e37(%rip),%rsi # b5d4 <_IO_stdin_used+0x5d4> 179d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 17a1: e8 da f8 ff ff callq 1080 <__printf_chk@plt> 17a6: 4c 8b 6d 08 mov 0x8(%rbp),%r13 17aa: ba 04 00 00 00 mov $0x4,%edx 17af: 48 8d 35 6f 9a 00 00 lea 0x9a6f(%rip),%rsi # b225 <_IO_stdin_used+0x225> 17b6: 4c 89 ef mov %r13,%rdi 17b9: e8 72 f8 ff ff callq 1030 17be: 85 c0 test %eax,%eax 17c0: 0f 85 c3 13 00 00 jne 2b89 17c6: 48 8d 35 c6 8c 00 00 lea 0x8cc6(%rip),%rsi # a493 17cd: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 17d3: 4c 89 f7 mov %r14,%rdi 17d6: e8 65 92 00 00 callq aa40 17db: bf 01 00 00 00 mov $0x1,%edi 17e0: b8 01 00 00 00 mov $0x1,%eax 17e5: 48 8d 35 d4 9d 00 00 lea 0x9dd4(%rip),%rsi # b5c0 <_IO_stdin_used+0x5c0> 17ec: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 17f0: e8 8b f8 ff ff callq 1080 <__printf_chk@plt> 17f5: 4c 8b 6d 08 mov 0x8(%rbp),%r13 17f9: ba 07 00 00 00 mov $0x7,%edx 17fe: 48 8d 35 25 9a 00 00 lea 0x9a25(%rip),%rsi # b22a <_IO_stdin_used+0x22a> 1805: 4c 89 ef mov %r13,%rdi 1808: e8 
23 f8 ff ff callq 1030 180d: 85 c0 test %eax,%eax 180f: 0f 85 90 13 00 00 jne 2ba5 1815: 48 8d 35 c8 8b 00 00 lea 0x8bc8(%rip),%rsi # a3e4 181c: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1822: 4c 89 f7 mov %r14,%rdi 1825: e8 16 92 00 00 callq aa40 182a: bf 01 00 00 00 mov $0x1,%edi 182f: b8 01 00 00 00 mov $0x1,%eax 1834: 48 8d 35 69 9d 00 00 lea 0x9d69(%rip),%rsi # b5a4 <_IO_stdin_used+0x5a4> 183b: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 183f: e8 3c f8 ff ff callq 1080 <__printf_chk@plt> 1844: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1848: ba 03 00 00 00 mov $0x3,%edx 184d: 48 8d 35 e8 99 00 00 lea 0x99e8(%rip),%rsi # b23c <_IO_stdin_used+0x23c> 1854: 4c 89 ef mov %r13,%rdi 1857: e8 d4 f7 ff ff callq 1030 185c: 85 c0 test %eax,%eax 185e: 0f 85 5d 13 00 00 jne 2bc1 1864: 48 8d 35 5b 57 00 00 lea 0x575b(%rip),%rsi # 6fc6 186b: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1871: 4c 89 f7 mov %r14,%rdi 1874: e8 c7 91 00 00 callq aa40 1879: bf 01 00 00 00 mov $0x1,%edi 187e: b8 01 00 00 00 mov $0x1,%eax 1883: 48 8d 35 03 9d 00 00 lea 0x9d03(%rip),%rsi # b58d <_IO_stdin_used+0x58d> 188a: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 188e: e8 ed f7 ff ff callq 1080 <__printf_chk@plt> 1893: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1897: ba 03 00 00 00 mov $0x3,%edx 189c: 48 8d 35 8f 99 00 00 lea 0x998f(%rip),%rsi # b232 <_IO_stdin_used+0x232> 18a3: 4c 89 ef mov %r13,%rdi 18a6: e8 85 f7 ff ff callq 1030 18ab: 85 c0 test %eax,%eax 18ad: 0f 85 2a 13 00 00 jne 2bdd 18b3: 48 8d 35 80 56 00 00 lea 0x5680(%rip),%rsi # 6f3a 18ba: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 18c0: 4c 89 f7 mov %r14,%rdi 18c3: e8 78 91 00 00 callq aa40 18c8: bf 01 00 00 00 mov $0x1,%edi 18cd: b8 01 00 00 00 mov $0x1,%eax 18d2: 48 8d 35 9d 9c 00 00 lea 0x9c9d(%rip),%rsi # b576 <_IO_stdin_used+0x576> 18d9: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 18dd: e8 9e f7 ff ff callq 1080 <__printf_chk@plt> 18e2: 4c 8b 6d 08 mov 0x8(%rbp),%r13 18e6: ba 09 00 00 00 mov $0x9,%edx 18eb: 48 8d 35 44 99 00 00 lea 0x9944(%rip),%rsi # b236 <_IO_stdin_used+0x236> 18f2: 4c 89 ef 
mov %r13,%rdi 18f5: e8 36 f7 ff ff callq 1030 18fa: 85 c0 test %eax,%eax 18fc: 0f 85 f7 12 00 00 jne 2bf9 1902: 48 8d 35 49 57 00 00 lea 0x5749(%rip),%rsi # 7052 1909: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 190f: 4c 89 f7 mov %r14,%rdi 1912: e8 29 91 00 00 callq aa40 1917: bf 01 00 00 00 mov $0x1,%edi 191c: b8 01 00 00 00 mov $0x1,%eax 1921: 48 8d 35 90 a6 00 00 lea 0xa690(%rip),%rsi # bfb8 <_IO_stdin_used+0xfb8> 1928: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 192c: e8 4f f7 ff ff callq 1080 <__printf_chk@plt> 1931: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1935: ba 03 00 00 00 mov $0x3,%edx 193a: 48 8d 35 ff 98 00 00 lea 0x98ff(%rip),%rsi # b240 <_IO_stdin_used+0x240> 1941: 4c 89 ef mov %r13,%rdi 1944: e8 e7 f6 ff ff callq 1030 1949: 85 c0 test %eax,%eax 194b: 0f 85 c4 12 00 00 jne 2c15 1951: 48 8d 35 86 57 00 00 lea 0x5786(%rip),%rsi # 70de 1958: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 195e: 4c 89 f7 mov %r14,%rdi 1961: e8 da 90 00 00 callq aa40 1966: bf 01 00 00 00 mov $0x1,%edi 196b: b8 01 00 00 00 mov $0x1,%eax 1970: 48 8d 35 e4 9b 00 00 lea 0x9be4(%rip),%rsi # b55b <_IO_stdin_used+0x55b> 1977: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 197b: e8 00 f7 ff ff callq 1080 <__printf_chk@plt> 1980: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1984: ba 03 00 00 00 mov $0x3,%edx 1989: 48 8d 35 c0 98 00 00 lea 0x98c0(%rip),%rsi # b250 <_IO_stdin_used+0x250> 1990: 4c 89 ef mov %r13,%rdi 1993: e8 98 f6 ff ff callq 1030 1998: 85 c0 test %eax,%eax 199a: 0f 85 91 12 00 00 jne 2c31 19a0: 48 8d 35 89 58 00 00 lea 0x5889(%rip),%rsi # 7230 19a7: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 19ad: 4c 89 f7 mov %r14,%rdi 19b0: e8 8b 90 00 00 callq aa40 19b5: bf 01 00 00 00 mov $0x1,%edi 19ba: b8 01 00 00 00 mov $0x1,%eax 19bf: 48 8d 35 82 9b 00 00 lea 0x9b82(%rip),%rsi # b548 <_IO_stdin_used+0x548> 19c6: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 19ca: e8 b1 f6 ff ff callq 1080 <__printf_chk@plt> 19cf: 4c 8b 6d 08 mov 0x8(%rbp),%r13 19d3: ba 09 00 00 00 mov $0x9,%edx 19d8: 48 8d 35 6b 98 00 00 lea 0x986b(%rip),%rsi # b24a 
<_IO_stdin_used+0x24a> 19df: 4c 89 ef mov %r13,%rdi 19e2: e8 49 f6 ff ff callq 1030 19e7: 85 c0 test %eax,%eax 19e9: 0f 85 5e 12 00 00 jne 2c4d 19ef: 48 8d 35 33 5a 00 00 lea 0x5a33(%rip),%rsi # 7429 19f6: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 19fc: 4c 89 f7 mov %r14,%rdi 19ff: e8 3c 90 00 00 callq aa40 1a04: bf 01 00 00 00 mov $0x1,%edi 1a09: b8 01 00 00 00 mov $0x1,%eax 1a0e: 48 8d 35 18 9b 00 00 lea 0x9b18(%rip),%rsi # b52d <_IO_stdin_used+0x52d> 1a15: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1a19: e8 62 f6 ff ff callq 1080 <__printf_chk@plt> 1a1e: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1a22: ba 09 00 00 00 mov $0x9,%edx 1a27: 48 8d 35 26 98 00 00 lea 0x9826(%rip),%rsi # b254 <_IO_stdin_used+0x254> 1a2e: 4c 89 ef mov %r13,%rdi 1a31: e8 fa f5 ff ff callq 1030 1a36: 85 c0 test %eax,%eax 1a38: 0f 85 2b 12 00 00 jne 2c69 1a3e: 48 8d 35 4e 57 00 00 lea 0x574e(%rip),%rsi # 7193 1a45: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1a4b: 4c 89 f7 mov %r14,%rdi 1a4e: e8 ed 8f 00 00 callq aa40 1a53: bf 01 00 00 00 mov $0x1,%edi 1a58: b8 01 00 00 00 mov $0x1,%eax 1a5d: 48 8d 35 ae 9a 00 00 lea 0x9aae(%rip),%rsi # b512 <_IO_stdin_used+0x512> 1a64: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1a68: e8 13 f6 ff ff callq 1080 <__printf_chk@plt> 1a6d: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1a71: ba 03 00 00 00 mov $0x3,%edx 1a76: 48 8d 35 e7 97 00 00 lea 0x97e7(%rip),%rsi # b264 <_IO_stdin_used+0x264> 1a7d: 4c 89 ef mov %r13,%rdi 1a80: e8 ab f5 ff ff callq 1030 1a85: 85 c0 test %eax,%eax 1a87: 0f 85 f8 11 00 00 jne 2c85 1a8d: 48 8d 35 3f 58 00 00 lea 0x583f(%rip),%rsi # 72d3 1a94: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1a9a: 4c 89 f7 mov %r14,%rdi 1a9d: e8 9e 8f 00 00 callq aa40 1aa2: bf 01 00 00 00 mov $0x1,%edi 1aa7: b8 01 00 00 00 mov $0x1,%eax 1aac: 48 8d 35 46 9a 00 00 lea 0x9a46(%rip),%rsi # b4f9 <_IO_stdin_used+0x4f9> 1ab3: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1ab7: e8 c4 f5 ff ff callq 1080 <__printf_chk@plt> 1abc: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1ac0: ba 09 00 00 00 mov $0x9,%edx 1ac5: 48 8d 35 92 97 00 00 lea 
0x9792(%rip),%rsi # b25e <_IO_stdin_used+0x25e> 1acc: 4c 89 ef mov %r13,%rdi 1acf: e8 5c f5 ff ff callq 1030 1ad4: 85 c0 test %eax,%eax 1ad6: 0f 85 c5 11 00 00 jne 2ca1 1adc: 48 8d 35 93 58 00 00 lea 0x5893(%rip),%rsi # 7376 1ae3: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1ae9: 4c 89 f7 mov %r14,%rdi 1aec: e8 4f 8f 00 00 callq aa40 1af1: bf 01 00 00 00 mov $0x1,%edi 1af6: b8 01 00 00 00 mov $0x1,%eax 1afb: 48 8d 35 8e a4 00 00 lea 0xa48e(%rip),%rsi # bf90 <_IO_stdin_used+0xf90> 1b02: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1b06: e8 75 f5 ff ff callq 1080 <__printf_chk@plt> 1b0b: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1b0f: ba 09 00 00 00 mov $0x9,%edx 1b14: 48 8d 35 5f 97 00 00 lea 0x975f(%rip),%rsi # b27a <_IO_stdin_used+0x27a> 1b1b: 4c 89 ef mov %r13,%rdi 1b1e: e8 0d f5 ff ff callq 1030 1b23: 85 c0 test %eax,%eax 1b25: 0f 85 92 11 00 00 jne 2cbd 1b2b: 48 8d 35 59 5d 00 00 lea 0x5d59(%rip),%rsi # 788b 1b32: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1b38: 4c 89 f7 mov %r14,%rdi 1b3b: e8 00 8f 00 00 callq aa40 1b40: bf 01 00 00 00 mov $0x1,%edi 1b45: b8 01 00 00 00 mov $0x1,%eax 1b4a: 48 8d 35 17 a4 00 00 lea 0xa417(%rip),%rsi # bf68 <_IO_stdin_used+0xf68> 1b51: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1b55: e8 26 f5 ff ff callq 1080 <__printf_chk@plt> 1b5a: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1b5e: ba 0c 00 00 00 mov $0xc,%edx 1b63: 48 8d 35 fe 96 00 00 lea 0x96fe(%rip),%rsi # b268 <_IO_stdin_used+0x268> 1b6a: 4c 89 ef mov %r13,%rdi 1b6d: e8 be f4 ff ff callq 1030 1b72: 85 c0 test %eax,%eax 1b74: 0f 85 5f 11 00 00 jne 2cd9 1b7a: 48 8d 35 c3 61 00 00 lea 0x61c3(%rip),%rsi # 7d44 1b81: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1b87: 4c 89 f7 mov %r14,%rdi 1b8a: e8 b1 8e 00 00 callq aa40 1b8f: bf 01 00 00 00 mov $0x1,%edi 1b94: b8 01 00 00 00 mov $0x1,%eax 1b99: 48 8d 35 90 a3 00 00 lea 0xa390(%rip),%rsi # bf30 <_IO_stdin_used+0xf30> 1ba0: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1ba4: e8 d7 f4 ff ff callq 1080 <__printf_chk@plt> 1ba9: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1bad: ba 0e 00 00 00 mov $0xe,%edx 1bb2: 48 
8d 35 bc 96 00 00 lea 0x96bc(%rip),%rsi # b275 <_IO_stdin_used+0x275> 1bb9: 4c 89 ef mov %r13,%rdi 1bbc: e8 6f f4 ff ff callq 1030 1bc1: 85 c0 test %eax,%eax 1bc3: 0f 85 2c 11 00 00 jne 2cf5 1bc9: 48 8d 35 39 62 00 00 lea 0x6239(%rip),%rsi # 7e09 1bd0: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1bd6: 4c 89 f7 mov %r14,%rdi 1bd9: e8 62 8e 00 00 callq aa40 1bde: bf 01 00 00 00 mov $0x1,%edi 1be3: b8 01 00 00 00 mov $0x1,%eax 1be8: 48 8d 35 09 a3 00 00 lea 0xa309(%rip),%rsi # bef8 <_IO_stdin_used+0xef8> 1bef: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1bf3: e8 88 f4 ff ff callq 1080 <__printf_chk@plt> 1bf8: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1bfc: ba 0e 00 00 00 mov $0xe,%edx 1c01: 48 8d 35 7c 96 00 00 lea 0x967c(%rip),%rsi # b284 <_IO_stdin_used+0x284> 1c08: 4c 89 ef mov %r13,%rdi 1c0b: e8 20 f4 ff ff callq 1030 1c10: 85 c0 test %eax,%eax 1c12: 0f 85 f9 10 00 00 jne 2d11 1c18: 48 8d 35 2c 5f 00 00 lea 0x5f2c(%rip),%rsi # 7b4b 1c1f: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1c25: 4c 89 f7 mov %r14,%rdi 1c28: e8 13 8e 00 00 callq aa40 1c2d: bf 01 00 00 00 mov $0x1,%edi 1c32: b8 01 00 00 00 mov $0x1,%eax 1c37: 48 8d 35 8a a2 00 00 lea 0xa28a(%rip),%rsi # bec8 <_IO_stdin_used+0xec8> 1c3e: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1c42: e8 39 f4 ff ff callq 1080 <__printf_chk@plt> 1c47: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1c4b: ba 08 00 00 00 mov $0x8,%edx 1c50: 48 8d 35 3c 96 00 00 lea 0x963c(%rip),%rsi # b293 <_IO_stdin_used+0x293> 1c57: 4c 89 ef mov %r13,%rdi 1c5a: e8 d1 f3 ff ff callq 1030 1c5f: 85 c0 test %eax,%eax 1c61: 0f 85 c6 10 00 00 jne 2d2d 1c67: 48 8d 35 32 60 00 00 lea 0x6032(%rip),%rsi # 7ca0 1c6e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1c74: 4c 89 f7 mov %r14,%rdi 1c77: e8 c4 8d 00 00 callq aa40 1c7c: bf 01 00 00 00 mov $0x1,%edi 1c81: b8 01 00 00 00 mov $0x1,%eax 1c86: 48 8d 35 0b a2 00 00 lea 0xa20b(%rip),%rsi # be98 <_IO_stdin_used+0xe98> 1c8d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1c91: e8 ea f3 ff ff callq 1080 <__printf_chk@plt> 1c96: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1c9a: ba 0c 00 00 00 
mov $0xc,%edx 1c9f: 48 8d 35 f6 95 00 00 lea 0x95f6(%rip),%rsi # b29c <_IO_stdin_used+0x29c> 1ca6: 4c 89 ef mov %r13,%rdi 1ca9: e8 82 f3 ff ff callq 1030 1cae: 85 c0 test %eax,%eax 1cb0: 0f 85 93 10 00 00 jne 2d49 1cb6: 48 8d 35 f3 61 00 00 lea 0x61f3(%rip),%rsi # 7eb0 1cbd: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1cc3: 4c 89 f7 mov %r14,%rdi 1cc6: e8 75 8d 00 00 callq aa40 1ccb: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 1cd1: bf 01 00 00 00 mov $0x1,%edi 1cd6: 48 8d 35 8b a1 00 00 lea 0xa18b(%rip),%rsi # be68 <_IO_stdin_used+0xe68> 1cdd: b8 01 00 00 00 mov $0x1,%eax 1ce2: f3 0f 5e f8 divss %xmm0,%xmm7 1ce6: 66 0f ef c0 pxor %xmm0,%xmm0 1cea: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 1cee: e8 8d f3 ff ff callq 1080 <__printf_chk@plt> 1cf3: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1cf7: ba 0c 00 00 00 mov $0xc,%edx 1cfc: 48 8d 35 a6 95 00 00 lea 0x95a6(%rip),%rsi # b2a9 <_IO_stdin_used+0x2a9> 1d03: 4c 89 ef mov %r13,%rdi 1d06: e8 25 f3 ff ff callq 1030 1d0b: 85 c0 test %eax,%eax 1d0d: 0f 85 52 10 00 00 jne 2d65 1d13: 48 8d 35 81 65 00 00 lea 0x6581(%rip),%rsi # 829b 1d1a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1d20: bf 00 2f 68 59 mov $0x59682f00,%edi 1d25: e8 16 8d 00 00 callq aa40 1d2a: f3 0f 10 74 24 0c movss 0xc(%rsp),%xmm6 1d30: bf 01 00 00 00 mov $0x1,%edi 1d35: 48 8d 35 f4 a0 00 00 lea 0xa0f4(%rip),%rsi # be30 <_IO_stdin_used+0xe30> 1d3c: b8 01 00 00 00 mov $0x1,%eax 1d41: f3 0f 5e f0 divss %xmm0,%xmm6 1d45: 66 0f ef c0 pxor %xmm0,%xmm0 1d49: f3 0f 5a c6 cvtss2sd %xmm6,%xmm0 1d4d: e8 2e f3 ff ff callq 1080 <__printf_chk@plt> 1d52: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1d56: ba 0c 00 00 00 mov $0xc,%edx 1d5b: 48 8d 35 54 95 00 00 lea 0x9554(%rip),%rsi # b2b6 <_IO_stdin_used+0x2b6> 1d62: 4c 89 ef mov %r13,%rdi 1d65: e8 c6 f2 ff ff callq 1030 1d6a: 85 c0 test %eax,%eax 1d6c: 0f 85 0f 10 00 00 jne 2d81 1d72: 48 8d 35 d0 65 00 00 lea 0x65d0(%rip),%rsi # 8349 1d79: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1d7f: 4c 89 f7 mov %r14,%rdi 1d82: e8 b9 8c 00 00 callq aa40 1d87: f3 0f 10 6c 24 0c 
movss 0xc(%rsp),%xmm5 1d8d: bf 01 00 00 00 mov $0x1,%edi 1d92: 48 8d 35 67 a0 00 00 lea 0xa067(%rip),%rsi # be00 <_IO_stdin_used+0xe00> 1d99: b8 01 00 00 00 mov $0x1,%eax 1d9e: f3 0f 5e e8 divss %xmm0,%xmm5 1da2: 66 0f ef c0 pxor %xmm0,%xmm0 1da6: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 1daa: e8 d1 f2 ff ff callq 1080 <__printf_chk@plt> 1daf: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1db3: ba 0c 00 00 00 mov $0xc,%edx 1db8: 48 8d 35 04 95 00 00 lea 0x9504(%rip),%rsi # b2c3 <_IO_stdin_used+0x2c3> 1dbf: 4c 89 ef mov %r13,%rdi 1dc2: e8 69 f2 ff ff callq 1030 1dc7: 85 c0 test %eax,%eax 1dc9: 0f 85 ce 0f 00 00 jne 2d9d 1dcf: 48 8d 35 c6 6a 00 00 lea 0x6ac6(%rip),%rsi # 889c 1dd6: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1ddc: bf 00 2f 68 59 mov $0x59682f00,%edi 1de1: e8 5a 8c 00 00 callq aa40 1de6: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 1dec: bf 01 00 00 00 mov $0x1,%edi 1df1: 48 8d 35 d0 9f 00 00 lea 0x9fd0(%rip),%rsi # bdc8 <_IO_stdin_used+0xdc8> 1df8: b8 01 00 00 00 mov $0x1,%eax 1dfd: f3 0f 5e f8 divss %xmm0,%xmm7 1e01: 66 0f ef c0 pxor %xmm0,%xmm0 1e05: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 1e09: e8 72 f2 ff ff callq 1080 <__printf_chk@plt> 1e0e: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1e12: ba 0b 00 00 00 mov $0xb,%edx 1e17: 48 8d 35 b2 94 00 00 lea 0x94b2(%rip),%rsi # b2d0 <_IO_stdin_used+0x2d0> 1e1e: 4c 89 ef mov %r13,%rdi 1e21: e8 0a f2 ff ff callq 1030 1e26: 85 c0 test %eax,%eax 1e28: 0f 85 8b 0f 00 00 jne 2db9 1e2e: 48 8d 35 8c 6b 00 00 lea 0x6b8c(%rip),%rsi # 89c1 1e35: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1e3b: bf 00 2f 68 59 mov $0x59682f00,%edi 1e40: e8 fb 8b 00 00 callq aa40 1e45: f3 0f 10 74 24 0c movss 0xc(%rsp),%xmm6 1e4b: bf 01 00 00 00 mov $0x1,%edi 1e50: 48 8d 35 49 9f 00 00 lea 0x9f49(%rip),%rsi # bda0 <_IO_stdin_used+0xda0> 1e57: b8 01 00 00 00 mov $0x1,%eax 1e5c: f3 0f 5e f0 divss %xmm0,%xmm6 1e60: 66 0f ef c0 pxor %xmm0,%xmm0 1e64: f3 0f 5a c6 cvtss2sd %xmm6,%xmm0 1e68: e8 13 f2 ff ff callq 1080 <__printf_chk@plt> 1e6d: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1e71: ba 0b 00 00 00 
mov $0xb,%edx 1e76: 48 8d 35 5f 94 00 00 lea 0x945f(%rip),%rsi # b2dc <_IO_stdin_used+0x2dc> 1e7d: 4c 89 ef mov %r13,%rdi 1e80: e8 ab f1 ff ff callq 1030 1e85: 85 c0 test %eax,%eax 1e87: 0f 85 48 0f 00 00 jne 2dd5 1e8d: 48 8d 35 e7 6c 00 00 lea 0x6ce7(%rip),%rsi # 8b7b 1e94: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1e9a: bf 00 2f 68 59 mov $0x59682f00,%edi 1e9f: e8 9c 8b 00 00 callq aa40 1ea4: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 1eaa: bf 01 00 00 00 mov $0x1,%edi 1eaf: 48 8d 35 c2 9e 00 00 lea 0x9ec2(%rip),%rsi # bd78 <_IO_stdin_used+0xd78> 1eb6: b8 01 00 00 00 mov $0x1,%eax 1ebb: f3 0f 5e e8 divss %xmm0,%xmm5 1ebf: 66 0f ef c0 pxor %xmm0,%xmm0 1ec3: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 1ec7: e8 b4 f1 ff ff callq 1080 <__printf_chk@plt> 1ecc: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1ed0: ba 0b 00 00 00 mov $0xb,%edx 1ed5: 48 8d 35 0c 94 00 00 lea 0x940c(%rip),%rsi # b2e8 <_IO_stdin_used+0x2e8> 1edc: 4c 89 ef mov %r13,%rdi 1edf: e8 4c f1 ff ff callq 1030 1ee4: 85 c0 test %eax,%eax 1ee6: 0f 85 05 0f 00 00 jne 2df1 1eec: 48 8d 35 99 79 00 00 lea 0x7999(%rip),%rsi # 988c 1ef3: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1ef9: bf 00 2f 68 59 mov $0x59682f00,%edi 1efe: e8 3d 8b 00 00 callq aa40 1f03: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 1f09: bf 01 00 00 00 mov $0x1,%edi 1f0e: 48 8d 35 3b 9e 00 00 lea 0x9e3b(%rip),%rsi # bd50 <_IO_stdin_used+0xd50> 1f15: b8 01 00 00 00 mov $0x1,%eax 1f1a: f3 0f 5e f8 divss %xmm0,%xmm7 1f1e: 66 0f ef c0 pxor %xmm0,%xmm0 1f22: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 1f26: e8 55 f1 ff ff callq 1080 <__printf_chk@plt> 1f2b: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1f2f: ba 0b 00 00 00 mov $0xb,%edx 1f34: 48 8d 35 b9 93 00 00 lea 0x93b9(%rip),%rsi # b2f4 <_IO_stdin_used+0x2f4> 1f3b: 4c 89 ef mov %r13,%rdi 1f3e: e8 ed f0 ff ff callq 1030 1f43: 85 c0 test %eax,%eax 1f45: 0f 85 c2 0e 00 00 jne 2e0d 1f4b: 48 8d 35 9a 79 00 00 lea 0x799a(%rip),%rsi # 98ec 1f52: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1f58: bf 00 2f 68 59 mov $0x59682f00,%edi 1f5d: e8 de 8a 00 00 callq aa40 
1f62: f3 0f 10 74 24 0c movss 0xc(%rsp),%xmm6 1f68: bf 01 00 00 00 mov $0x1,%edi 1f6d: 48 8d 35 b4 9d 00 00 lea 0x9db4(%rip),%rsi # bd28 <_IO_stdin_used+0xd28> 1f74: b8 01 00 00 00 mov $0x1,%eax 1f79: f3 0f 5e f0 divss %xmm0,%xmm6 1f7d: 66 0f ef c0 pxor %xmm0,%xmm0 1f81: f3 0f 5a c6 cvtss2sd %xmm6,%xmm0 1f85: e8 f6 f0 ff ff callq 1080 <__printf_chk@plt> 1f8a: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1f8e: ba 08 00 00 00 mov $0x8,%edx 1f93: 48 8d 35 51 93 00 00 lea 0x9351(%rip),%rsi # b2eb <_IO_stdin_used+0x2eb> 1f9a: 4c 89 ef mov %r13,%rdi 1f9d: e8 8e f0 ff ff callq 1030 1fa2: 85 c0 test %eax,%eax 1fa4: 0f 85 7f 0e 00 00 jne 2e29 1faa: 48 8d 35 0a 7a 00 00 lea 0x7a0a(%rip),%rsi # 99bb 1fb1: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 1fb7: 4c 89 f7 mov %r14,%rdi 1fba: e8 81 8a 00 00 callq aa40 1fbf: bf 01 00 00 00 mov $0x1,%edi 1fc4: b8 01 00 00 00 mov $0x1,%eax 1fc9: 48 8d 35 38 9d 00 00 lea 0x9d38(%rip),%rsi # bd08 <_IO_stdin_used+0xd08> 1fd0: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 1fd4: e8 a7 f0 ff ff callq 1080 <__printf_chk@plt> 1fd9: 4c 8b 6d 08 mov 0x8(%rbp),%r13 1fdd: ba 08 00 00 00 mov $0x8,%edx 1fe2: 48 8d 35 0e 93 00 00 lea 0x930e(%rip),%rsi # b2f7 <_IO_stdin_used+0x2f7> 1fe9: 4c 89 ef mov %r13,%rdi 1fec: e8 3f f0 ff ff callq 1030 1ff1: 85 c0 test %eax,%eax 1ff3: 0f 85 4c 0e 00 00 jne 2e45 1ff9: 48 8d 35 4c 79 00 00 lea 0x794c(%rip),%rsi # 994c 2000: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 2006: 4c 89 f7 mov %r14,%rdi 2009: e8 32 8a 00 00 callq aa40 200e: bf 01 00 00 00 mov $0x1,%edi 2013: b8 01 00 00 00 mov $0x1,%eax 2018: 48 8d 35 c9 9c 00 00 lea 0x9cc9(%rip),%rsi # bce8 <_IO_stdin_used+0xce8> 201f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 2023: e8 58 f0 ff ff callq 1080 <__printf_chk@plt> 2028: 4c 8b 6d 08 mov 0x8(%rbp),%r13 202c: ba 09 00 00 00 mov $0x9,%edx 2031: 48 8d 35 81 92 00 00 lea 0x9281(%rip),%rsi # b2b9 <_IO_stdin_used+0x2b9> 2038: 4c 89 ef mov %r13,%rdi 203b: e8 f0 ef ff ff callq 1030 2040: 85 c0 test %eax,%eax 2042: 0f 85 19 0e 00 00 jne 2e61 2048: 48 8d 35 65 63 00 
00 lea 0x6365(%rip),%rsi # 83b4 204f: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 2055: 4c 89 f7 mov %r14,%rdi 2058: e8 e3 89 00 00 callq aa40 205d: bf 01 00 00 00 mov $0x1,%edi 2062: b8 01 00 00 00 mov $0x1,%eax 2067: 48 8d 35 52 9c 00 00 lea 0x9c52(%rip),%rsi # bcc0 <_IO_stdin_used+0xcc0> 206e: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 2072: e8 09 f0 ff ff callq 1080 <__printf_chk@plt> 2077: 4c 8b 6d 08 mov 0x8(%rbp),%r13 207b: ba 09 00 00 00 mov $0x9,%edx 2080: 48 8d 35 4e 93 00 00 lea 0x934e(%rip),%rsi # b3d5 <_IO_stdin_used+0x3d5> 2087: 4c 89 ef mov %r13,%rdi 208a: e8 a1 ef ff ff callq 1030 208f: 85 c0 test %eax,%eax 2091: 0f 85 e6 0d 00 00 jne 2e7d 2097: 48 8d 35 7f 67 00 00 lea 0x677f(%rip),%rsi # 881d 209e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 20a4: 4c 89 f7 mov %r14,%rdi 20a7: e8 94 89 00 00 callq aa40 20ac: bf 01 00 00 00 mov $0x1,%edi 20b1: b8 01 00 00 00 mov $0x1,%eax 20b6: 48 8d 35 db 9b 00 00 lea 0x9bdb(%rip),%rsi # bc98 <_IO_stdin_used+0xc98> 20bd: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 20c1: e8 ba ef ff ff callq 1080 <__printf_chk@plt> 20c6: 4c 8b 6d 08 mov 0x8(%rbp),%r13 20ca: ba 06 00 00 00 mov $0x6,%edx 20cf: 48 8d 35 2d 92 00 00 lea 0x922d(%rip),%rsi # b303 <_IO_stdin_used+0x303> 20d6: 4c 89 ef mov %r13,%rdi 20d9: e8 52 ef ff ff callq 1030 20de: 85 c0 test %eax,%eax 20e0: 0f 85 b3 0d 00 00 jne 2e99 20e6: 48 8d 35 c2 6c 00 00 lea 0x6cc2(%rip),%rsi # 8daf 20ed: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 20f3: bf 00 2f 68 59 mov $0x59682f00,%edi 20f8: e8 43 89 00 00 callq aa40 20fd: bf 01 00 00 00 mov $0x1,%edi 2102: b8 01 00 00 00 mov $0x1,%eax 2107: 48 8d 35 d0 93 00 00 lea 0x93d0(%rip),%rsi # b4de <_IO_stdin_used+0x4de> 210e: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 2112: e8 69 ef ff ff callq 1080 <__printf_chk@plt> 2117: 4c 8b 6d 08 mov 0x8(%rbp),%r13 211b: ba 06 00 00 00 mov $0x6,%edx 2120: 48 8d 35 8a 90 00 00 lea 0x908a(%rip),%rsi # b1b1 <_IO_stdin_used+0x1b1> 2127: 4c 89 ef mov %r13,%rdi 212a: e8 01 ef ff ff callq 1030 212f: 85 c0 test %eax,%eax 2131: 0f 85 7e 0d 00 
00 jne 2eb5 2137: 48 8d 35 29 6d 00 00 lea 0x6d29(%rip),%rsi # 8e67 213e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 2144: bf 00 2f 68 59 mov $0x59682f00,%edi 2149: e8 f2 88 00 00 callq aa40 214e: bf 01 00 00 00 mov $0x1,%edi 2153: b8 01 00 00 00 mov $0x1,%eax 2158: 48 8d 35 64 93 00 00 lea 0x9364(%rip),%rsi # b4c3 <_IO_stdin_used+0x4c3> 215f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 2163: e8 18 ef ff ff callq 1080 <__printf_chk@plt> 2168: 4c 8b 6d 08 mov 0x8(%rbp),%r13 216c: ba 09 00 00 00 mov $0x9,%edx 2171: 48 8d 35 88 91 00 00 lea 0x9188(%rip),%rsi # b300 <_IO_stdin_used+0x300> 2178: 4c 89 ef mov %r13,%rdi 217b: e8 b0 ee ff ff callq 1030 2180: 85 c0 test %eax,%eax 2182: 0f 85 49 0d 00 00 jne 2ed1 2188: 48 8d 35 8a 75 00 00 lea 0x758a(%rip),%rsi # 9719 218f: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 2195: bf 00 2f 68 59 mov $0x59682f00,%edi 219a: e8 a1 88 00 00 callq aa40 219f: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 21a5: bf 01 00 00 00 mov $0x1,%edi 21aa: 48 8d 35 bf 9a 00 00 lea 0x9abf(%rip),%rsi # bc70 <_IO_stdin_used+0xc70> 21b1: b8 01 00 00 00 mov $0x1,%eax 21b6: f3 0f 5e e8 divss %xmm0,%xmm5 21ba: 66 0f ef c0 pxor %xmm0,%xmm0 21be: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 21c2: e8 b9 ee ff ff callq 1080 <__printf_chk@plt> 21c7: 4c 8b 6d 08 mov 0x8(%rbp),%r13 21cb: ba 09 00 00 00 mov $0x9,%edx 21d0: 48 8d 35 33 91 00 00 lea 0x9133(%rip),%rsi # b30a <_IO_stdin_used+0x30a> 21d7: 4c 89 ef mov %r13,%rdi 21da: e8 51 ee ff ff callq 1030 21df: 85 c0 test %eax,%eax 21e1: 0f 85 06 0d 00 00 jne 2eed 21e7: 48 8d 35 e3 75 00 00 lea 0x75e3(%rip),%rsi # 97d1 21ee: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 21f4: bf 00 2f 68 59 mov $0x59682f00,%edi 21f9: e8 42 88 00 00 callq aa40 21fe: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 2204: bf 01 00 00 00 mov $0x1,%edi 2209: 48 8d 35 38 9a 00 00 lea 0x9a38(%rip),%rsi # bc48 <_IO_stdin_used+0xc48> 2210: b8 01 00 00 00 mov $0x1,%eax 2215: f3 0f 5e f8 divss %xmm0,%xmm7 2219: 66 0f ef c0 pxor %xmm0,%xmm0 221d: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 2221: e8 5a ee 
ff ff callq 1080 <__printf_chk@plt> 2226: 4c 8b 6d 08 mov 0x8(%rbp),%r13 222a: ba 06 00 00 00 mov $0x6,%edx 222f: 48 8d 35 ec 90 00 00 lea 0x90ec(%rip),%rsi # b322 <_IO_stdin_used+0x322> 2236: 4c 89 ef mov %r13,%rdi 2239: e8 f2 ed ff ff callq 1030 223e: 85 c0 test %eax,%eax 2240: 0f 85 c3 0c 00 00 jne 2f09 2246: 48 8d 35 8f 68 00 00 lea 0x688f(%rip),%rsi # 8adc 224d: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 2253: bf 00 2f 68 59 mov $0x59682f00,%edi 2258: e8 e3 87 00 00 callq aa40 225d: bf 01 00 00 00 mov $0x1,%edi 2262: b8 01 00 00 00 mov $0x1,%eax 2267: 48 8d 35 39 92 00 00 lea 0x9239(%rip),%rsi # b4a7 <_IO_stdin_used+0x4a7> 226e: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 2272: e8 09 ee ff ff callq 1080 <__printf_chk@plt> 2277: 4c 8b 6d 08 mov 0x8(%rbp),%r13 227b: ba 06 00 00 00 mov $0x6,%edx 2280: 48 8d 35 8d 90 00 00 lea 0x908d(%rip),%rsi # b314 <_IO_stdin_used+0x314> 2287: 4c 89 ef mov %r13,%rdi 228a: e8 a1 ed ff ff callq 1030 228f: 85 c0 test %eax,%eax 2291: 0f 85 8e 0c 00 00 jne 2f25 2297: 48 8d 35 9f 67 00 00 lea 0x679f(%rip),%rsi # 8a3d 229e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 22a4: bf 00 2f 68 59 mov $0x59682f00,%edi 22a9: e8 92 87 00 00 callq aa40 22ae: bf 01 00 00 00 mov $0x1,%edi 22b3: b8 01 00 00 00 mov $0x1,%eax 22b8: 48 8d 35 cc 91 00 00 lea 0x91cc(%rip),%rsi # b48b <_IO_stdin_used+0x48b> 22bf: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 22c3: e8 b8 ed ff ff callq 1080 <__printf_chk@plt> 22c8: 4c 8b 6d 08 mov 0x8(%rbp),%r13 22cc: ba 0c 00 00 00 mov $0xc,%edx 22d1: 48 8d 35 44 90 00 00 lea 0x9044(%rip),%rsi # b31c <_IO_stdin_used+0x31c> 22d8: 4c 89 ef mov %r13,%rdi 22db: e8 50 ed ff ff callq 1030 22e0: 85 c0 test %eax,%eax 22e2: 0f 85 59 0c 00 00 jne 2f41 22e8: 48 8d 35 33 6c 00 00 lea 0x6c33(%rip),%rsi # 8f22 22ef: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 22f5: bf 05 7a d7 03 mov $0x3d77a05,%edi 22fa: 48 c1 e7 09 shl $0x9,%rdi 22fe: e8 3d 87 00 00 callq aa40 2303: bf 01 00 00 00 mov $0x1,%edi 2308: b8 01 00 00 00 mov $0x1,%eax 230d: 48 8d 35 0c 99 00 00 lea 
0x990c(%rip),%rsi # bc20 <_IO_stdin_used+0xc20> 2314: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 2318: e8 63 ed ff ff callq 1080 <__printf_chk@plt> 231d: 4c 8b 6d 08 mov 0x8(%rbp),%r13 2321: ba 0b 00 00 00 mov $0xb,%edx 2326: 48 8d 35 fd 8f 00 00 lea 0x8ffd(%rip),%rsi # b32a <_IO_stdin_used+0x32a> 232d: 4c 89 ef mov %r13,%rdi 2330: e8 fb ec ff ff callq 1030 2335: 85 c0 test %eax,%eax 2337: 0f 85 20 0c 00 00 jne 2f5d 233d: 48 8d 35 8d 6e 00 00 lea 0x6e8d(%rip),%rsi # 91d1 2344: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 234a: bf 05 7a d7 03 mov $0x3d77a05,%edi 234f: 48 c1 e7 09 shl $0x9,%rdi 2353: e8 e8 86 00 00 callq aa40 2358: bf 01 00 00 00 mov $0x1,%edi 235d: b8 01 00 00 00 mov $0x1,%eax 2362: 48 8d 35 8f 98 00 00 lea 0x988f(%rip),%rsi # bbf8 <_IO_stdin_used+0xbf8> 2369: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 236d: e8 0e ed ff ff callq 1080 <__printf_chk@plt> 2372: 4c 8b 6d 08 mov 0x8(%rbp),%r13 2376: ba 0e 00 00 00 mov $0xe,%edx 237b: 48 8d 35 b5 8f 00 00 lea 0x8fb5(%rip),%rsi # b337 <_IO_stdin_used+0x337> 2382: 4c 89 ef mov %r13,%rdi 2385: e8 a6 ec ff ff callq 1030 238a: 85 c0 test %eax,%eax 238c: 0f 85 e7 0b 00 00 jne 2f79 2392: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 2398: bf 05 7a d7 03 mov $0x3d77a05,%edi 239d: 48 8d 35 8c 86 00 00 lea 0x868c(%rip),%rsi # aa30 23a4: 48 c1 e7 09 shl $0x9,%rdi 23a8: e8 93 86 00 00 callq aa40 23ad: bf 01 00 00 00 mov $0x1,%edi 23b2: b8 01 00 00 00 mov $0x1,%eax 23b7: 48 8d 35 0a 98 00 00 lea 0x980a(%rip),%rsi # bbc8 <_IO_stdin_used+0xbc8> 23be: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 23c2: e8 b9 ec ff ff callq 1080 <__printf_chk@plt> 23c7: 4c 8b 6d 08 mov 0x8(%rbp),%r13 23cb: ba 0b 00 00 00 mov $0xb,%edx 23d0: 48 8d 35 70 8f 00 00 lea 0x8f70(%rip),%rsi # b347 <_IO_stdin_used+0x347> 23d7: 4c 89 ef mov %r13,%rdi 23da: e8 51 ec ff ff callq 1030 23df: 85 c0 test %eax,%eax 23e1: 75 39 jne 241c 23e3: 48 8d 35 98 6e 00 00 lea 0x6e98(%rip),%rsi # 9282 23ea: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 23f0: bf 05 7a d7 03 mov $0x3d77a05,%edi 23f5: 48 c1 e7 09 shl 
$0x9,%rdi 23f9: e8 42 86 00 00 callq aa40 23fe: bf 01 00 00 00 mov $0x1,%edi 2403: b8 01 00 00 00 mov $0x1,%eax 2408: 48 8d 35 91 97 00 00 lea 0x9791(%rip),%rsi # bba0 <_IO_stdin_used+0xba0> 240f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 2413: e8 68 ec ff ff callq 1080 <__printf_chk@plt> 2418: 4c 8b 6d 08 mov 0x8(%rbp),%r13 241c: ba 0e 00 00 00 mov $0xe,%edx 2421: 48 8d 35 0f 8f 00 00 lea 0x8f0f(%rip),%rsi # b337 <_IO_stdin_used+0x337> 2428: 4c 89 ef mov %r13,%rdi 242b: e8 00 ec ff ff callq 1030 2430: 85 c0 test %eax,%eax 2432: 0f 85 5d 0b 00 00 jne 2f95 2438: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 243e: bf 05 7a d7 03 mov $0x3d77a05,%edi 2443: 48 8d 35 d6 85 00 00 lea 0x85d6(%rip),%rsi # aa20 244a: 48 c1 e7 09 shl $0x9,%rdi 244e: e8 ed 85 00 00 callq aa40 2453: bf 01 00 00 00 mov $0x1,%edi 2458: b8 01 00 00 00 mov $0x1,%eax 245d: 48 8d 35 e4 9b 00 00 lea 0x9be4(%rip),%rsi # c048 <_IO_stdin_used+0x1048> 2464: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 2468: e8 13 ec ff ff callq 1080 <__printf_chk@plt> 246d: 4c 8b 6d 08 mov 0x8(%rbp),%r13 2471: ba 0d 00 00 00 mov $0xd,%edx 2476: 48 8d 35 9a 8c 00 00 lea 0x8c9a(%rip),%rsi # b117 <_IO_stdin_used+0x117> 247d: 4c 89 ef mov %r13,%rdi 2480: e8 ab eb ff ff callq 1030 2485: 85 c0 test %eax,%eax 2487: 0f 85 24 0b 00 00 jne 2fb1 248d: 48 8d 35 04 71 00 00 lea 0x7104(%rip),%rsi # 9598 2494: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 249a: bf 05 7a d7 03 mov $0x3d77a05,%edi 249f: 48 c1 e7 09 shl $0x9,%rdi 24a3: e8 98 85 00 00 callq aa40 24a8: bf 01 00 00 00 mov $0x1,%edi 24ad: b8 01 00 00 00 mov $0x1,%eax 24b2: 48 8d 35 b7 96 00 00 lea 0x96b7(%rip),%rsi # bb70 <_IO_stdin_used+0xb70> 24b9: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 24bd: e8 be eb ff ff callq 1080 <__printf_chk@plt> 24c2: 4c 8b 6d 08 mov 0x8(%rbp),%r13 24c6: ba 0f 00 00 00 mov $0xf,%edx 24cb: 48 8d 35 82 8e 00 00 lea 0x8e82(%rip),%rsi # b354 <_IO_stdin_used+0x354> 24d2: 4c 89 ef mov %r13,%rdi 24d5: e8 56 eb ff ff callq 1030 24da: 85 c0 test %eax,%eax 24dc: 0f 85 eb 0a 00 00 jne 2fcd 24e2: 48 
8d 35 0b 57 00 00 lea 0x570b(%rip),%rsi # 7bf4 24e9: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 24ef: bf 00 2f 68 59 mov $0x59682f00,%edi 24f4: e8 47 85 00 00 callq aa40 24f9: bf 01 00 00 00 mov $0x1,%edi 24fe: b8 01 00 00 00 mov $0x1,%eax 2503: 48 8d 35 3e 96 00 00 lea 0x963e(%rip),%rsi # bb48 <_IO_stdin_used+0xb48> 250a: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 250e: e8 6d eb ff ff callq 1080 <__printf_chk@plt> 2513: 4c 8b 6d 08 mov 0x8(%rbp),%r13 2517: ba 08 00 00 00 mov $0x8,%edx 251c: 48 8d 35 42 8e 00 00 lea 0x8e42(%rip),%rsi # b365 <_IO_stdin_used+0x365> 2523: 4c 89 ef mov %r13,%rdi 2526: e8 05 eb ff ff callq 1030 252b: 85 c0 test %eax,%eax 252d: 0f 85 b6 0a 00 00 jne 2fe9 2533: 48 8d 35 90 75 00 00 lea 0x7590(%rip),%rsi # 9aca 253a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 2540: bf 00 2f 68 59 mov $0x59682f00,%edi 2545: e8 f6 84 00 00 callq aa40 254a: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 2550: bf 01 00 00 00 mov $0x1,%edi 2555: 48 8d 35 c4 95 00 00 lea 0x95c4(%rip),%rsi # bb20 <_IO_stdin_used+0xb20> 255c: b8 01 00 00 00 mov $0x1,%eax 2561: f3 0f 5e e8 divss %xmm0,%xmm5 2565: 66 0f ef c0 pxor %xmm0,%xmm0 2569: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 256d: e8 0e eb ff ff callq 1080 <__printf_chk@plt> 2572: 4c 8b 6d 08 mov 0x8(%rbp),%r13 2576: ba 08 00 00 00 mov $0x8,%edx 257b: 48 8d 35 ec 8d 00 00 lea 0x8dec(%rip),%rsi # b36e <_IO_stdin_used+0x36e> 2582: 4c 89 ef mov %r13,%rdi 2585: e8 a6 ea ff ff callq 1030 258a: 85 c0 test %eax,%eax 258c: 0f 85 73 0a 00 00 jne 3005 2592: 48 8d 35 91 74 00 00 lea 0x7491(%rip),%rsi # 9a2a 2599: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 259f: bf 00 2f 68 59 mov $0x59682f00,%edi 25a4: e8 97 84 00 00 callq aa40 25a9: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 25af: bf 01 00 00 00 mov $0x1,%edi 25b4: 48 8d 35 3d 95 00 00 lea 0x953d(%rip),%rsi # baf8 <_IO_stdin_used+0xaf8> 25bb: b8 01 00 00 00 mov $0x1,%eax 25c0: f3 0f 5e e8 divss %xmm0,%xmm5 25c4: 66 0f ef c0 pxor %xmm0,%xmm0 25c8: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 25cc: e8 af ea ff ff callq 1080 
<__printf_chk@plt> 25d1: 4c 8b 6d 08 mov 0x8(%rbp),%r13 25d5: ba 05 00 00 00 mov $0x5,%edx 25da: 48 8d 35 87 8d 00 00 lea 0x8d87(%rip),%rsi # b368 <_IO_stdin_used+0x368> 25e1: 4c 89 ef mov %r13,%rdi 25e4: e8 47 ea ff ff callq 1030 25e9: 85 c0 test %eax,%eax 25eb: 0f 85 30 0a 00 00 jne 3021 25f1: 48 8d 35 86 75 00 00 lea 0x7586(%rip),%rsi # 9b7e 25f8: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 25fe: bf 00 2f 68 59 mov $0x59682f00,%edi 2603: e8 38 84 00 00 callq aa40 2608: bf 01 00 00 00 mov $0x1,%edi 260d: b8 01 00 00 00 mov $0x1,%eax 2612: 48 8d 35 57 8e 00 00 lea 0x8e57(%rip),%rsi # b470 <_IO_stdin_used+0x470> 2619: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 261d: e8 5e ea ff ff callq 1080 <__printf_chk@plt> 2622: 4c 8b 6d 08 mov 0x8(%rbp),%r13 2626: ba 05 00 00 00 mov $0x5,%edx 262b: 48 8d 35 5d 8d 00 00 lea 0x8d5d(%rip),%rsi # b38f <_IO_stdin_used+0x38f> 2632: 4c 89 ef mov %r13,%rdi 2635: e8 f6 e9 ff ff callq 1030 263a: 85 c0 test %eax,%eax 263c: 0f 85 fb 09 00 00 jne 303d 2642: 48 8d 35 e9 75 00 00 lea 0x75e9(%rip),%rsi # 9c32 2649: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 264f: bf 00 2f 68 59 mov $0x59682f00,%edi 2654: e8 e7 83 00 00 callq aa40 2659: bf 01 00 00 00 mov $0x1,%edi 265e: b8 01 00 00 00 mov $0x1,%eax 2663: 48 8d 35 eb 8d 00 00 lea 0x8deb(%rip),%rsi # b455 <_IO_stdin_used+0x455> 266a: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 266e: e8 0d ea ff ff callq 1080 <__printf_chk@plt> 2673: 4c 8b 6d 08 mov 0x8(%rbp),%r13 2677: ba 05 00 00 00 mov $0x5,%edx 267c: 48 8d 35 f4 8c 00 00 lea 0x8cf4(%rip),%rsi # b377 <_IO_stdin_used+0x377> 2683: 4c 89 ef mov %r13,%rdi 2686: e8 a5 e9 ff ff callq 1030 268b: 85 c0 test %eax,%eax 268d: 0f 85 c6 09 00 00 jne 3059 2693: 48 8d 35 7a 76 00 00 lea 0x767a(%rip),%rsi # 9d14 269a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 26a0: bf 00 2f 68 59 mov $0x59682f00,%edi 26a5: e8 96 83 00 00 callq aa40 26aa: bf 01 00 00 00 mov $0x1,%edi 26af: b8 01 00 00 00 mov $0x1,%eax 26b4: 48 8d 35 0d 94 00 00 lea 0x940d(%rip),%rsi # bac8 <_IO_stdin_used+0xac8> 26bb: f3 
0f 5a c0 cvtss2sd %xmm0,%xmm0 26bf: e8 bc e9 ff ff callq 1080 <__printf_chk@plt> 26c4: 4c 8b 6d 08 mov 0x8(%rbp),%r13 26c8: ba 05 00 00 00 mov $0x5,%edx 26cd: 48 8d 35 b1 8c 00 00 lea 0x8cb1(%rip),%rsi # b385 <_IO_stdin_used+0x385> 26d4: 4c 89 ef mov %r13,%rdi 26d7: e8 54 e9 ff ff callq 1030 26dc: 85 c0 test %eax,%eax 26de: 0f 85 91 09 00 00 jne 3075 26e4: 48 8d 35 d5 76 00 00 lea 0x76d5(%rip),%rsi # 9dc0 26eb: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 26f1: bf 00 2f 68 59 mov $0x59682f00,%edi 26f6: e8 45 83 00 00 callq aa40 26fb: bf 01 00 00 00 mov $0x1,%edi 2700: b8 01 00 00 00 mov $0x1,%eax 2705: 48 8d 35 8c 93 00 00 lea 0x938c(%rip),%rsi # ba98 <_IO_stdin_used+0xa98> 270c: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 2710: e8 6b e9 ff ff callq 1080 <__printf_chk@plt> 2715: 4c 8b 6d 08 mov 0x8(%rbp),%r13 2719: ba 07 00 00 00 mov $0x7,%edx 271e: 48 8d 35 76 8c 00 00 lea 0x8c76(%rip),%rsi # b39b <_IO_stdin_used+0x39b> 2725: 4c 89 ef mov %r13,%rdi 2728: e8 03 e9 ff ff callq 1030 272d: 85 c0 test %eax,%eax 272f: 0f 85 5c 09 00 00 jne 3091 2735: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 273b: 48 8d 35 2e 82 00 00 lea 0x822e(%rip),%rsi # a970 2742: bf 00 2f 68 59 mov $0x59682f00,%edi 2747: e8 f4 82 00 00 callq aa40 274c: bf 01 00 00 00 mov $0x1,%edi 2751: b8 01 00 00 00 mov $0x1,%eax 2756: 48 8d 35 db 8c 00 00 lea 0x8cdb(%rip),%rsi # b438 <_IO_stdin_used+0x438> 275d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 2761: e8 1a e9 ff ff callq 1080 <__printf_chk@plt> 2766: 4c 8b 6d 08 mov 0x8(%rbp),%r13 276a: ba 0d 00 00 00 mov $0xd,%edx 276f: 48 8d 35 1f 8c 00 00 lea 0x8c1f(%rip),%rsi # b395 <_IO_stdin_used+0x395> 2776: 4c 89 ef mov %r13,%rdi 2779: e8 b2 e8 ff ff callq 1030 277e: 85 c0 test %eax,%eax 2780: 0f 85 27 09 00 00 jne 30ad 2786: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 278c: 48 8d 35 ed 81 00 00 lea 0x81ed(%rip),%rsi # a980 2793: bf 00 2f 68 59 mov $0x59682f00,%edi 2798: e8 a3 82 00 00 callq aa40 279d: bf 01 00 00 00 mov $0x1,%edi 27a2: b8 01 00 00 00 mov $0x1,%eax 27a7: 48 8d 35 c2 92 00 00 
lea 0x92c2(%rip),%rsi # ba70 <_IO_stdin_used+0xa70> 27ae: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 27b2: e8 c9 e8 ff ff callq 1080 <__printf_chk@plt> 27b7: 4c 8b 6d 08 mov 0x8(%rbp),%r13 27bb: ba 07 00 00 00 mov $0x7,%edx 27c0: 48 8d 35 dc 8b 00 00 lea 0x8bdc(%rip),%rsi # b3a3 <_IO_stdin_used+0x3a3> 27c7: 4c 89 ef mov %r13,%rdi 27ca: e8 61 e8 ff ff callq 1030 27cf: 85 c0 test %eax,%eax 27d1: 0f 85 f2 08 00 00 jne 30c9 27d7: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 27dd: 48 8d 35 bc 81 00 00 lea 0x81bc(%rip),%rsi # a9a0 27e4: bf 00 2f 68 59 mov $0x59682f00,%edi 27e9: e8 52 82 00 00 callq aa40 27ee: bf 01 00 00 00 mov $0x1,%edi 27f3: b8 01 00 00 00 mov $0x1,%eax 27f8: 48 8d 35 1c 8c 00 00 lea 0x8c1c(%rip),%rsi # b41b <_IO_stdin_used+0x41b> 27ff: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 2803: e8 78 e8 ff ff callq 1080 <__printf_chk@plt> 2808: 4c 8b 6d 08 mov 0x8(%rbp),%r13 280c: ba 0d 00 00 00 mov $0xd,%edx 2811: 48 8d 35 93 8b 00 00 lea 0x8b93(%rip),%rsi # b3ab <_IO_stdin_used+0x3ab> 2818: 4c 89 ef mov %r13,%rdi 281b: e8 10 e8 ff ff callq 1030 2820: 85 c0 test %eax,%eax 2822: 0f 85 bd 08 00 00 jne 30e5 2828: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 282e: 48 8d 35 5b 81 00 00 lea 0x815b(%rip),%rsi # a990 2835: bf 00 2f 68 59 mov $0x59682f00,%edi 283a: e8 01 82 00 00 callq aa40 283f: bf 01 00 00 00 mov $0x1,%edi 2844: b8 01 00 00 00 mov $0x1,%eax 2849: 48 8d 35 f8 91 00 00 lea 0x91f8(%rip),%rsi # ba48 <_IO_stdin_used+0xa48> 2850: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 2854: e8 27 e8 ff ff callq 1080 <__printf_chk@plt> 2859: 4c 8b 6d 08 mov 0x8(%rbp),%r13 285d: ba 07 00 00 00 mov $0x7,%edx 2862: 48 8d 35 54 8b 00 00 lea 0x8b54(%rip),%rsi # b3bd <_IO_stdin_used+0x3bd> 2869: 4c 89 ef mov %r13,%rdi 286c: e8 bf e7 ff ff callq 1030 2871: 85 c0 test %eax,%eax 2873: 75 35 jne 28aa 2875: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 287b: 48 8d 35 3e 81 00 00 lea 0x813e(%rip),%rsi # a9c0 2882: bf 00 2f 68 59 mov $0x59682f00,%edi 2887: e8 b4 81 00 00 callq aa40 288c: bf 01 00 00 00 mov $0x1,%edi 2891: b8 01 
00 00 00 mov $0x1,%eax 2896: 48 8d 35 60 8b 00 00 lea 0x8b60(%rip),%rsi # b3fd <_IO_stdin_used+0x3fd> 289d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 28a1: e8 da e7 ff ff callq 1080 <__printf_chk@plt> 28a6: 4c 8b 6d 08 mov 0x8(%rbp),%r13 28aa: ba 07 00 00 00 mov $0x7,%edx 28af: 48 8d 35 10 8b 00 00 lea 0x8b10(%rip),%rsi # b3c6 <_IO_stdin_used+0x3c6> 28b6: 4c 89 ef mov %r13,%rdi 28b9: e8 72 e7 ff ff callq 1030 28be: 85 c0 test %eax,%eax 28c0: 75 3b jne 28fd 28c2: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 28c8: 48 8d 35 11 81 00 00 lea 0x8111(%rip),%rsi # a9e0 28cf: bf 00 2f 68 59 mov $0x59682f00,%edi 28d4: e8 67 81 00 00 callq aa40 28d9: bf 01 00 00 00 mov $0x1,%edi 28de: b8 01 00 00 00 mov $0x1,%eax 28e3: 48 8d 35 f5 8a 00 00 lea 0x8af5(%rip),%rsi # b3df <_IO_stdin_used+0x3df> 28ea: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 28ee: e8 8d e7 ff ff callq 1080 <__printf_chk@plt> 28f3: 41 83 ec 01 sub $0x1,%r12d 28f7: 7e 4d jle 2946 28f9: 4c 8b 6d 08 mov 0x8(%rbp),%r13 28fd: ba 0f 00 00 00 mov $0xf,%edx 2902: 48 8d 35 c6 8a 00 00 lea 0x8ac6(%rip),%rsi # b3cf <_IO_stdin_used+0x3cf> 2909: 4c 89 ef mov %r13,%rdi 290c: e8 1f e7 ff ff callq 1030 2911: 85 c0 test %eax,%eax 2913: 75 31 jne 2946 2915: 48 8d 35 ff 5f 00 00 lea 0x5fff(%rip),%rsi # 891b 291c: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 2922: bf 00 2f 68 59 mov $0x59682f00,%edi 2927: e8 14 81 00 00 callq aa40 292c: bf 01 00 00 00 mov $0x1,%edi 2931: b8 01 00 00 00 mov $0x1,%eax 2936: 48 8d 35 db 90 00 00 lea 0x90db(%rip),%rsi # ba18 <_IO_stdin_used+0xa18> 293d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 2941: e8 3a e7 ff ff callq 1080 <__printf_chk@plt> 2946: 48 8b 44 24 48 mov 0x48(%rsp),%rax 294b: 64 48 33 04 25 28 00 xor %fs:0x28,%rax 2952: 00 00 2954: 0f 85 ce 3b 00 00 jne 6528 295a: 48 83 c4 58 add $0x58,%rsp 295e: 31 c0 xor %eax,%eax 2960: 5b pop %rbx 2961: 5d pop %rbp 2962: 41 5c pop %r12 2964: 41 5d pop %r13 2966: 41 5e pop %r14 2968: 41 5f pop %r15 296a: c3 retq 296b: ba 05 00 00 00 mov $0x5,%edx 2970: 48 8d 35 f7 8d 00 00 lea 
0x8df7(%rip),%rsi # b76e <_IO_stdin_used+0x76e> 2977: 4c 89 ef mov %r13,%rdi 297a: e8 b1 e6 ff ff callq 1030 297f: 85 c0 test %eax,%eax 2981: 0f 84 34 3b 00 00 je 64bb 2987: f3 0f 10 35 f9 96 00 movss 0x96f9(%rip),%xmm6 # c088 <_IO_stdin_used+0x1088> 298e: 00 298f: f3 0f 11 74 24 0c movss %xmm6,0xc(%rsp) 2995: f3 0f 11 74 24 08 movss %xmm6,0x8(%rsp) 299b: ba 05 00 00 00 mov $0x5,%edx 29a0: 48 8d 35 1e 88 00 00 lea 0x881e(%rip),%rsi # b1c5 <_IO_stdin_used+0x1c5> 29a7: 4c 89 ef mov %r13,%rdi 29aa: e8 81 e6 ff ff callq 1030 29af: 85 c0 test %eax,%eax 29b1: 0f 84 43 2f 00 00 je 58fa 29b7: ba 03 00 00 00 mov $0x3,%edx 29bc: 48 8d 35 4c 88 00 00 lea 0x884c(%rip),%rsi # b20f <_IO_stdin_used+0x20f> 29c3: 4c 89 ef mov %r13,%rdi 29c6: e8 65 e6 ff ff callq 1030 29cb: 85 c0 test %eax,%eax 29cd: 0f 84 e9 2e 00 00 je 58bc 29d3: 48 8d 35 f1 87 00 00 lea 0x87f1(%rip),%rsi # b1cb <_IO_stdin_used+0x1cb> 29da: 4c 89 ef mov %r13,%rdi 29dd: e8 7e e6 ff ff callq 1060 29e2: 85 c0 test %eax,%eax 29e4: 0f 84 91 2e 00 00 je 587b 29ea: 48 8d 35 e1 87 00 00 lea 0x87e1(%rip),%rsi # b1d2 <_IO_stdin_used+0x1d2> 29f1: 4c 89 ef mov %r13,%rdi 29f4: e8 67 e6 ff ff callq 1060 29f9: 85 c0 test %eax,%eax 29fb: 0f 84 3c 2e 00 00 je 583d 2a01: ba 06 00 00 00 mov $0x6,%edx 2a06: 48 8d 35 ce 87 00 00 lea 0x87ce(%rip),%rsi # b1db <_IO_stdin_used+0x1db> 2a0d: 4c 89 ef mov %r13,%rdi 2a10: e8 1b e6 ff ff callq 1030 2a15: 85 c0 test %eax,%eax 2a17: 0f 84 e2 2d 00 00 je 57ff 2a1d: ba 08 00 00 00 mov $0x8,%edx 2a22: 48 8d 35 b0 87 00 00 lea 0x87b0(%rip),%rsi # b1d9 <_IO_stdin_used+0x1d9> 2a29: 4c 89 ef mov %r13,%rdi 2a2c: e8 ff e5 ff ff callq 1030 2a31: 85 c0 test %eax,%eax 2a33: 0f 84 88 2d 00 00 je 57c1 2a39: ba 07 00 00 00 mov $0x7,%edx 2a3e: 48 8d 35 9d 87 00 00 lea 0x879d(%rip),%rsi # b1e2 <_IO_stdin_used+0x1e2> 2a45: 4c 89 ef mov %r13,%rdi 2a48: e8 e3 e5 ff ff callq 1030 2a4d: 85 c0 test %eax,%eax 2a4f: 0f 84 2e 2d 00 00 je 5783 2a55: ba 07 00 00 00 mov $0x7,%edx 2a5a: 48 8d 35 89 87 00 00 lea 
0x8789(%rip),%rsi # b1ea <_IO_stdin_used+0x1ea> 2a61: 4c 89 ef mov %r13,%rdi 2a64: e8 c7 e5 ff ff callq 1030 2a69: 85 c0 test %eax,%eax 2a6b: 0f 84 d4 2c 00 00 je 5745 2a71: ba 07 00 00 00 mov $0x7,%edx 2a76: 48 8d 35 75 87 00 00 lea 0x8775(%rip),%rsi # b1f2 <_IO_stdin_used+0x1f2> 2a7d: 4c 89 ef mov %r13,%rdi 2a80: e8 ab e5 ff ff callq 1030 2a85: 85 c0 test %eax,%eax 2a87: 0f 84 7a 2c 00 00 je 5707 2a8d: ba 06 00 00 00 mov $0x6,%edx 2a92: 48 8d 35 61 87 00 00 lea 0x8761(%rip),%rsi # b1fa <_IO_stdin_used+0x1fa> 2a99: 4c 89 ef mov %r13,%rdi 2a9c: e8 8f e5 ff ff callq 1030 2aa1: 85 c0 test %eax,%eax 2aa3: 0f 84 20 2c 00 00 je 56c9 2aa9: ba 06 00 00 00 mov $0x6,%edx 2aae: 48 8d 35 4c 87 00 00 lea 0x874c(%rip),%rsi # b201 <_IO_stdin_used+0x201> 2ab5: 4c 89 ef mov %r13,%rdi 2ab8: e8 73 e5 ff ff callq 1030 2abd: 85 c0 test %eax,%eax 2abf: 0f 84 c6 2b 00 00 je 568b 2ac5: ba 06 00 00 00 mov $0x6,%edx 2aca: 48 8d 35 96 8c 00 00 lea 0x8c96(%rip),%rsi # b767 <_IO_stdin_used+0x767> 2ad1: 4c 89 ef mov %r13,%rdi 2ad4: e8 57 e5 ff ff callq 1030 2ad9: 85 c0 test %eax,%eax 2adb: 0f 84 6c 2b 00 00 je 564d 2ae1: ba 0a 00 00 00 mov $0xa,%edx 2ae6: 48 8d 35 1b 87 00 00 lea 0x871b(%rip),%rsi # b208 <_IO_stdin_used+0x208> 2aed: 4c 89 ef mov %r13,%rdi 2af0: e8 3b e5 ff ff callq 1030 2af5: 85 c0 test %eax,%eax 2af7: 0f 84 12 2b 00 00 je 560f 2afd: ba 06 00 00 00 mov $0x6,%edx 2b02: 48 8d 35 0a 87 00 00 lea 0x870a(%rip),%rsi # b213 <_IO_stdin_used+0x213> 2b09: 4c 89 ef mov %r13,%rdi 2b0c: e8 1f e5 ff ff callq 1030 2b11: 85 c0 test %eax,%eax 2b13: 0f 84 b8 2a 00 00 je 55d1 2b19: ba 03 00 00 00 mov $0x3,%edx 2b1e: 48 8d 35 f7 86 00 00 lea 0x86f7(%rip),%rsi # b21c <_IO_stdin_used+0x21c> 2b25: 4c 89 ef mov %r13,%rdi 2b28: e8 03 e5 ff ff callq 1030 2b2d: 85 c0 test %eax,%eax 2b2f: 0f 84 5e 2a 00 00 je 5593 2b35: ba 05 00 00 00 mov $0x5,%edx 2b3a: 48 8d 35 d9 86 00 00 lea 0x86d9(%rip),%rsi # b21a <_IO_stdin_used+0x21a> 2b41: 4c 89 ef mov %r13,%rdi 2b44: e8 e7 e4 ff ff callq 1030 2b49: 85 c0 test 
%eax,%eax 2b4b: 0f 84 04 2a 00 00 je 5555 2b51: ba 04 00 00 00 mov $0x4,%edx 2b56: 48 8d 35 c3 86 00 00 lea 0x86c3(%rip),%rsi # b220 <_IO_stdin_used+0x220> 2b5d: 4c 89 ef mov %r13,%rdi 2b60: e8 cb e4 ff ff callq 1030 2b65: 85 c0 test %eax,%eax 2b67: 0f 84 aa 29 00 00 je 5517 2b6d: ba 04 00 00 00 mov $0x4,%edx 2b72: 48 8d 35 ac 86 00 00 lea 0x86ac(%rip),%rsi # b225 <_IO_stdin_used+0x225> 2b79: 4c 89 ef mov %r13,%rdi 2b7c: e8 af e4 ff ff callq 1030 2b81: 85 c0 test %eax,%eax 2b83: 0f 84 50 29 00 00 je 54d9 2b89: ba 07 00 00 00 mov $0x7,%edx 2b8e: 48 8d 35 95 86 00 00 lea 0x8695(%rip),%rsi # b22a <_IO_stdin_used+0x22a> 2b95: 4c 89 ef mov %r13,%rdi 2b98: e8 93 e4 ff ff callq 1030 2b9d: 85 c0 test %eax,%eax 2b9f: 0f 84 f6 28 00 00 je 549b 2ba5: ba 03 00 00 00 mov $0x3,%edx 2baa: 48 8d 35 8b 86 00 00 lea 0x868b(%rip),%rsi # b23c <_IO_stdin_used+0x23c> 2bb1: 4c 89 ef mov %r13,%rdi 2bb4: e8 77 e4 ff ff callq 1030 2bb9: 85 c0 test %eax,%eax 2bbb: 0f 84 9c 28 00 00 je 545d 2bc1: ba 03 00 00 00 mov $0x3,%edx 2bc6: 48 8d 35 65 86 00 00 lea 0x8665(%rip),%rsi # b232 <_IO_stdin_used+0x232> 2bcd: 4c 89 ef mov %r13,%rdi 2bd0: e8 5b e4 ff ff callq 1030 2bd5: 85 c0 test %eax,%eax 2bd7: 0f 84 42 28 00 00 je 541f 2bdd: ba 09 00 00 00 mov $0x9,%edx 2be2: 48 8d 35 4d 86 00 00 lea 0x864d(%rip),%rsi # b236 <_IO_stdin_used+0x236> 2be9: 4c 89 ef mov %r13,%rdi 2bec: e8 3f e4 ff ff callq 1030 2bf1: 85 c0 test %eax,%eax 2bf3: 0f 84 e8 27 00 00 je 53e1 2bf9: ba 03 00 00 00 mov $0x3,%edx 2bfe: 48 8d 35 3b 86 00 00 lea 0x863b(%rip),%rsi # b240 <_IO_stdin_used+0x240> 2c05: 4c 89 ef mov %r13,%rdi 2c08: e8 23 e4 ff ff callq 1030 2c0d: 85 c0 test %eax,%eax 2c0f: 0f 84 8e 27 00 00 je 53a3 2c15: ba 03 00 00 00 mov $0x3,%edx 2c1a: 48 8d 35 2f 86 00 00 lea 0x862f(%rip),%rsi # b250 <_IO_stdin_used+0x250> 2c21: 4c 89 ef mov %r13,%rdi 2c24: e8 07 e4 ff ff callq 1030 2c29: 85 c0 test %eax,%eax 2c2b: 0f 84 34 27 00 00 je 5365 2c31: ba 09 00 00 00 mov $0x9,%edx 2c36: 48 8d 35 0d 86 00 00 lea 0x860d(%rip),%rsi # 
b24a <_IO_stdin_used+0x24a> 2c3d: 4c 89 ef mov %r13,%rdi 2c40: e8 eb e3 ff ff callq 1030 2c45: 85 c0 test %eax,%eax 2c47: 0f 84 da 26 00 00 je 5327 2c4d: ba 09 00 00 00 mov $0x9,%edx 2c52: 48 8d 35 fb 85 00 00 lea 0x85fb(%rip),%rsi # b254 <_IO_stdin_used+0x254> 2c59: 4c 89 ef mov %r13,%rdi 2c5c: e8 cf e3 ff ff callq 1030 2c61: 85 c0 test %eax,%eax 2c63: 0f 84 80 26 00 00 je 52e9 2c69: ba 03 00 00 00 mov $0x3,%edx 2c6e: 48 8d 35 ef 85 00 00 lea 0x85ef(%rip),%rsi # b264 <_IO_stdin_used+0x264> 2c75: 4c 89 ef mov %r13,%rdi 2c78: e8 b3 e3 ff ff callq 1030 2c7d: 85 c0 test %eax,%eax 2c7f: 0f 84 26 26 00 00 je 52ab 2c85: ba 09 00 00 00 mov $0x9,%edx 2c8a: 48 8d 35 cd 85 00 00 lea 0x85cd(%rip),%rsi # b25e <_IO_stdin_used+0x25e> 2c91: 4c 89 ef mov %r13,%rdi 2c94: e8 97 e3 ff ff callq 1030 2c99: 85 c0 test %eax,%eax 2c9b: 0f 84 cc 25 00 00 je 526d 2ca1: ba 09 00 00 00 mov $0x9,%edx 2ca6: 48 8d 35 cd 85 00 00 lea 0x85cd(%rip),%rsi # b27a <_IO_stdin_used+0x27a> 2cad: 4c 89 ef mov %r13,%rdi 2cb0: e8 7b e3 ff ff callq 1030 2cb5: 85 c0 test %eax,%eax 2cb7: 0f 84 72 25 00 00 je 522f 2cbd: ba 0c 00 00 00 mov $0xc,%edx 2cc2: 48 8d 35 9f 85 00 00 lea 0x859f(%rip),%rsi # b268 <_IO_stdin_used+0x268> 2cc9: 4c 89 ef mov %r13,%rdi 2ccc: e8 5f e3 ff ff callq 1030 2cd1: 85 c0 test %eax,%eax 2cd3: 0f 84 18 25 00 00 je 51f1 2cd9: ba 0e 00 00 00 mov $0xe,%edx 2cde: 48 8d 35 90 85 00 00 lea 0x8590(%rip),%rsi # b275 <_IO_stdin_used+0x275> 2ce5: 4c 89 ef mov %r13,%rdi 2ce8: e8 43 e3 ff ff callq 1030 2ced: 85 c0 test %eax,%eax 2cef: 0f 84 be 24 00 00 je 51b3 2cf5: ba 0e 00 00 00 mov $0xe,%edx 2cfa: 48 8d 35 83 85 00 00 lea 0x8583(%rip),%rsi # b284 <_IO_stdin_used+0x284> 2d01: 4c 89 ef mov %r13,%rdi 2d04: e8 27 e3 ff ff callq 1030 2d09: 85 c0 test %eax,%eax 2d0b: 0f 84 64 24 00 00 je 5175 2d11: ba 08 00 00 00 mov $0x8,%edx 2d16: 48 8d 35 76 85 00 00 lea 0x8576(%rip),%rsi # b293 <_IO_stdin_used+0x293> 2d1d: 4c 89 ef mov %r13,%rdi 2d20: e8 0b e3 ff ff callq 1030 2d25: 85 c0 test %eax,%eax 2d27: 0f 84 
0a 24 00 00 je 5137 2d2d: ba 0c 00 00 00 mov $0xc,%edx 2d32: 48 8d 35 63 85 00 00 lea 0x8563(%rip),%rsi # b29c <_IO_stdin_used+0x29c> 2d39: 4c 89 ef mov %r13,%rdi 2d3c: e8 ef e2 ff ff callq 1030 2d41: 85 c0 test %eax,%eax 2d43: 0f 84 a2 23 00 00 je 50eb 2d49: ba 0c 00 00 00 mov $0xc,%edx 2d4e: 48 8d 35 54 85 00 00 lea 0x8554(%rip),%rsi # b2a9 <_IO_stdin_used+0x2a9> 2d55: 4c 89 ef mov %r13,%rdi 2d58: e8 d3 e2 ff ff callq 1030 2d5d: 85 c0 test %eax,%eax 2d5f: 0f 84 38 23 00 00 je 509d 2d65: ba 0c 00 00 00 mov $0xc,%edx 2d6a: 48 8d 35 45 85 00 00 lea 0x8545(%rip),%rsi # b2b6 <_IO_stdin_used+0x2b6> 2d71: 4c 89 ef mov %r13,%rdi 2d74: e8 b7 e2 ff ff callq 1030 2d79: 85 c0 test %eax,%eax 2d7b: 0f 84 d0 22 00 00 je 5051 2d81: ba 0c 00 00 00 mov $0xc,%edx 2d86: 48 8d 35 36 85 00 00 lea 0x8536(%rip),%rsi # b2c3 <_IO_stdin_used+0x2c3> 2d8d: 4c 89 ef mov %r13,%rdi 2d90: e8 9b e2 ff ff callq 1030 2d95: 85 c0 test %eax,%eax 2d97: 0f 84 66 22 00 00 je 5003 2d9d: ba 0b 00 00 00 mov $0xb,%edx 2da2: 48 8d 35 27 85 00 00 lea 0x8527(%rip),%rsi # b2d0 <_IO_stdin_used+0x2d0> 2da9: 4c 89 ef mov %r13,%rdi 2dac: e8 7f e2 ff ff callq 1030 2db1: 85 c0 test %eax,%eax 2db3: 0f 84 fc 21 00 00 je 4fb5 2db9: ba 0b 00 00 00 mov $0xb,%edx 2dbe: 48 8d 35 17 85 00 00 lea 0x8517(%rip),%rsi # b2dc <_IO_stdin_used+0x2dc> 2dc5: 4c 89 ef mov %r13,%rdi 2dc8: e8 63 e2 ff ff callq 1030 2dcd: 85 c0 test %eax,%eax 2dcf: 0f 84 92 21 00 00 je 4f67 2dd5: ba 0b 00 00 00 mov $0xb,%edx 2dda: 48 8d 35 07 85 00 00 lea 0x8507(%rip),%rsi # b2e8 <_IO_stdin_used+0x2e8> 2de1: 4c 89 ef mov %r13,%rdi 2de4: e8 47 e2 ff ff callq 1030 2de9: 85 c0 test %eax,%eax 2deb: 0f 84 28 21 00 00 je 4f19 2df1: ba 0b 00 00 00 mov $0xb,%edx 2df6: 48 8d 35 f7 84 00 00 lea 0x84f7(%rip),%rsi # b2f4 <_IO_stdin_used+0x2f4> 2dfd: 4c 89 ef mov %r13,%rdi 2e00: e8 2b e2 ff ff callq 1030 2e05: 85 c0 test %eax,%eax 2e07: 0f 84 be 20 00 00 je 4ecb 2e0d: ba 08 00 00 00 mov $0x8,%edx 2e12: 48 8d 35 d2 84 00 00 lea 0x84d2(%rip),%rsi # b2eb 
<_IO_stdin_used+0x2eb> 2e19: 4c 89 ef mov %r13,%rdi 2e1c: e8 0f e2 ff ff callq 1030 2e21: 85 c0 test %eax,%eax 2e23: 0f 84 64 20 00 00 je 4e8d 2e29: ba 08 00 00 00 mov $0x8,%edx 2e2e: 48 8d 35 c2 84 00 00 lea 0x84c2(%rip),%rsi # b2f7 <_IO_stdin_used+0x2f7> 2e35: 4c 89 ef mov %r13,%rdi 2e38: e8 f3 e1 ff ff callq 1030 2e3d: 85 c0 test %eax,%eax 2e3f: 0f 84 0a 20 00 00 je 4e4f 2e45: ba 09 00 00 00 mov $0x9,%edx 2e4a: 48 8d 35 68 84 00 00 lea 0x8468(%rip),%rsi # b2b9 <_IO_stdin_used+0x2b9> 2e51: 4c 89 ef mov %r13,%rdi 2e54: e8 d7 e1 ff ff callq 1030 2e59: 85 c0 test %eax,%eax 2e5b: 0f 84 b0 1f 00 00 je 4e11 2e61: ba 09 00 00 00 mov $0x9,%edx 2e66: 48 8d 35 68 85 00 00 lea 0x8568(%rip),%rsi # b3d5 <_IO_stdin_used+0x3d5> 2e6d: 4c 89 ef mov %r13,%rdi 2e70: e8 bb e1 ff ff callq 1030 2e75: 85 c0 test %eax,%eax 2e77: 0f 84 56 1f 00 00 je 4dd3 2e7d: ba 06 00 00 00 mov $0x6,%edx 2e82: 48 8d 35 7a 84 00 00 lea 0x847a(%rip),%rsi # b303 <_IO_stdin_used+0x303> 2e89: 4c 89 ef mov %r13,%rdi 2e8c: e8 9f e1 ff ff callq 1030 2e91: 85 c0 test %eax,%eax 2e93: 0f 84 fa 1e 00 00 je 4d93 2e99: ba 06 00 00 00 mov $0x6,%edx 2e9e: 48 8d 35 0c 83 00 00 lea 0x830c(%rip),%rsi # b1b1 <_IO_stdin_used+0x1b1> 2ea5: 4c 89 ef mov %r13,%rdi 2ea8: e8 83 e1 ff ff callq 1030 2ead: 85 c0 test %eax,%eax 2eaf: 0f 84 9e 1e 00 00 je 4d53 2eb5: ba 09 00 00 00 mov $0x9,%edx 2eba: 48 8d 35 3f 84 00 00 lea 0x843f(%rip),%rsi # b300 <_IO_stdin_used+0x300> 2ec1: 4c 89 ef mov %r13,%rdi 2ec4: e8 67 e1 ff ff callq 1030 2ec9: 85 c0 test %eax,%eax 2ecb: 0f 84 34 1e 00 00 je 4d05 2ed1: ba 09 00 00 00 mov $0x9,%edx 2ed6: 48 8d 35 2d 84 00 00 lea 0x842d(%rip),%rsi # b30a <_IO_stdin_used+0x30a> 2edd: 4c 89 ef mov %r13,%rdi 2ee0: e8 4b e1 ff ff callq 1030 2ee5: 85 c0 test %eax,%eax 2ee7: 0f 84 ca 1d 00 00 je 4cb7 2eed: ba 06 00 00 00 mov $0x6,%edx 2ef2: 48 8d 35 29 84 00 00 lea 0x8429(%rip),%rsi # b322 <_IO_stdin_used+0x322> 2ef9: 4c 89 ef mov %r13,%rdi 2efc: e8 2f e1 ff ff callq 1030 2f01: 85 c0 test %eax,%eax 2f03: 0f 84 6e 
1d 00 00 je 4c77 2f09: ba 06 00 00 00 mov $0x6,%edx 2f0e: 48 8d 35 ff 83 00 00 lea 0x83ff(%rip),%rsi # b314 <_IO_stdin_used+0x314> 2f15: 4c 89 ef mov %r13,%rdi 2f18: e8 13 e1 ff ff callq 1030 2f1d: 85 c0 test %eax,%eax 2f1f: 0f 84 12 1d 00 00 je 4c37 2f25: ba 0c 00 00 00 mov $0xc,%edx 2f2a: 48 8d 35 eb 83 00 00 lea 0x83eb(%rip),%rsi # b31c <_IO_stdin_used+0x31c> 2f31: 4c 89 ef mov %r13,%rdi 2f34: e8 f7 e0 ff ff callq 1030 2f39: 85 c0 test %eax,%eax 2f3b: 0f 84 b2 1c 00 00 je 4bf3 2f41: ba 0b 00 00 00 mov $0xb,%edx 2f46: 48 8d 35 dd 83 00 00 lea 0x83dd(%rip),%rsi # b32a <_IO_stdin_used+0x32a> 2f4d: 4c 89 ef mov %r13,%rdi 2f50: e8 db e0 ff ff callq 1030 2f55: 85 c0 test %eax,%eax 2f57: 0f 84 52 1c 00 00 je 4baf 2f5d: ba 0e 00 00 00 mov $0xe,%edx 2f62: 48 8d 35 ce 83 00 00 lea 0x83ce(%rip),%rsi # b337 <_IO_stdin_used+0x337> 2f69: 4c 89 ef mov %r13,%rdi 2f6c: e8 bf e0 ff ff callq 1030 2f71: 85 c0 test %eax,%eax 2f73: 0f 84 f2 1b 00 00 je 4b6b 2f79: ba 0b 00 00 00 mov $0xb,%edx 2f7e: 48 8d 35 c2 83 00 00 lea 0x83c2(%rip),%rsi # b347 <_IO_stdin_used+0x347> 2f85: 4c 89 ef mov %r13,%rdi 2f88: e8 a3 e0 ff ff callq 1030 2f8d: 85 c0 test %eax,%eax 2f8f: 0f 84 92 1b 00 00 je 4b27 2f95: ba 0d 00 00 00 mov $0xd,%edx 2f9a: 48 8d 35 76 81 00 00 lea 0x8176(%rip),%rsi # b117 <_IO_stdin_used+0x117> 2fa1: 4c 89 ef mov %r13,%rdi 2fa4: e8 87 e0 ff ff callq 1030 2fa9: 85 c0 test %eax,%eax 2fab: 0f 84 32 1b 00 00 je 4ae3 2fb1: ba 0f 00 00 00 mov $0xf,%edx 2fb6: 48 8d 35 97 83 00 00 lea 0x8397(%rip),%rsi # b354 <_IO_stdin_used+0x354> 2fbd: 4c 89 ef mov %r13,%rdi 2fc0: e8 6b e0 ff ff callq 1030 2fc5: 85 c0 test %eax,%eax 2fc7: 0f 84 d6 1a 00 00 je 4aa3 2fcd: ba 08 00 00 00 mov $0x8,%edx 2fd2: 48 8d 35 8c 83 00 00 lea 0x838c(%rip),%rsi # b365 <_IO_stdin_used+0x365> 2fd9: 4c 89 ef mov %r13,%rdi 2fdc: e8 4f e0 ff ff callq 1030 2fe1: 85 c0 test %eax,%eax 2fe3: 0f 84 6c 1a 00 00 je 4a55 2fe9: ba 08 00 00 00 mov $0x8,%edx 2fee: 48 8d 35 79 83 00 00 lea 0x8379(%rip),%rsi # b36e 
<_IO_stdin_used+0x36e> 2ff5: 4c 89 ef mov %r13,%rdi 2ff8: e8 33 e0 ff ff callq 1030 2ffd: 85 c0 test %eax,%eax 2fff: 0f 84 02 1a 00 00 je 4a07 3005: ba 05 00 00 00 mov $0x5,%edx 300a: 48 8d 35 57 83 00 00 lea 0x8357(%rip),%rsi # b368 <_IO_stdin_used+0x368> 3011: 4c 89 ef mov %r13,%rdi 3014: e8 17 e0 ff ff callq 1030 3019: 85 c0 test %eax,%eax 301b: 0f 84 a6 19 00 00 je 49c7 3021: ba 05 00 00 00 mov $0x5,%edx 3026: 48 8d 35 62 83 00 00 lea 0x8362(%rip),%rsi # b38f <_IO_stdin_used+0x38f> 302d: 4c 89 ef mov %r13,%rdi 3030: e8 fb df ff ff callq 1030 3035: 85 c0 test %eax,%eax 3037: 0f 84 4a 19 00 00 je 4987 303d: ba 05 00 00 00 mov $0x5,%edx 3042: 48 8d 35 2e 83 00 00 lea 0x832e(%rip),%rsi # b377 <_IO_stdin_used+0x377> 3049: 4c 89 ef mov %r13,%rdi 304c: e8 df df ff ff callq 1030 3051: 85 c0 test %eax,%eax 3053: 0f 84 ee 18 00 00 je 4947 3059: ba 05 00 00 00 mov $0x5,%edx 305e: 48 8d 35 20 83 00 00 lea 0x8320(%rip),%rsi # b385 <_IO_stdin_used+0x385> 3065: 4c 89 ef mov %r13,%rdi 3068: e8 c3 df ff ff callq 1030 306d: 85 c0 test %eax,%eax 306f: 0f 84 92 18 00 00 je 4907 3075: ba 07 00 00 00 mov $0x7,%edx 307a: 48 8d 35 1a 83 00 00 lea 0x831a(%rip),%rsi # b39b <_IO_stdin_used+0x39b> 3081: 4c 89 ef mov %r13,%rdi 3084: e8 a7 df ff ff callq 1030 3089: 85 c0 test %eax,%eax 308b: 0f 84 36 18 00 00 je 48c7 3091: ba 0d 00 00 00 mov $0xd,%edx 3096: 48 8d 35 f8 82 00 00 lea 0x82f8(%rip),%rsi # b395 <_IO_stdin_used+0x395> 309d: 4c 89 ef mov %r13,%rdi 30a0: e8 8b df ff ff callq 1030 30a5: 85 c0 test %eax,%eax 30a7: 0f 84 da 17 00 00 je 4887 30ad: ba 07 00 00 00 mov $0x7,%edx 30b2: 48 8d 35 ea 82 00 00 lea 0x82ea(%rip),%rsi # b3a3 <_IO_stdin_used+0x3a3> 30b9: 4c 89 ef mov %r13,%rdi 30bc: e8 6f df ff ff callq 1030 30c1: 85 c0 test %eax,%eax 30c3: 0f 84 7e 17 00 00 je 4847 30c9: ba 0d 00 00 00 mov $0xd,%edx 30ce: 48 8d 35 d6 82 00 00 lea 0x82d6(%rip),%rsi # b3ab <_IO_stdin_used+0x3ab> 30d5: 4c 89 ef mov %r13,%rdi 30d8: e8 53 df ff ff callq 1030 30dd: 85 c0 test %eax,%eax 30df: 0f 84 22 
17 00 00 je 4807 30e5: ba 07 00 00 00 mov $0x7,%edx 30ea: 48 8d 35 cc 82 00 00 lea 0x82cc(%rip),%rsi # b3bd <_IO_stdin_used+0x3bd> 30f1: 4c 89 ef mov %r13,%rdi 30f4: e8 37 df ff ff callq 1030 30f9: 85 c0 test %eax,%eax 30fb: 0f 85 a9 f7 ff ff jne 28aa 3101: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3107: 48 8d 35 b2 78 00 00 lea 0x78b2(%rip),%rsi # a9c0 310e: bf 00 2f 68 59 mov $0x59682f00,%edi 3113: e8 28 79 00 00 callq aa40 3118: bf 01 00 00 00 mov $0x1,%edi 311d: b8 01 00 00 00 mov $0x1,%eax 3122: 48 8d 35 d4 82 00 00 lea 0x82d4(%rip),%rsi # b3fd <_IO_stdin_used+0x3fd> 3129: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 312d: e8 4e df ff ff callq 1080 <__printf_chk@plt> 3132: 41 83 fc 01 cmp $0x1,%r12d 3136: 0f 8f 6a f7 ff ff jg 28a6 313c: e9 05 f8 ff ff jmpq 2946 3141: 48 8b 0d 98 af 00 00 mov 0xaf98(%rip),%rcx # e0e0 3148: ba 11 00 00 00 mov $0x11,%edx 314d: be 01 00 00 00 mov $0x1,%esi 3152: 48 8d 3d f2 7e 00 00 lea 0x7ef2(%rip),%rdi # b04b <_IO_stdin_used+0x4b> 3159: e8 32 df ff ff callq 1090 315e: 41 83 fc 01 cmp $0x1,%r12d 3162: 0f 84 48 33 00 00 je 64b0 3168: f2 0f 10 05 10 8f 00 movsd 0x8f10(%rip),%xmm0 # c080 <_IO_stdin_used+0x1080> 316f: 00 3170: bf 01 00 00 00 mov $0x1,%edi 3175: b8 01 00 00 00 mov $0x1,%eax 317a: 48 8d 35 f7 85 00 00 lea 0x85f7(%rip),%rsi # b778 <_IO_stdin_used+0x778> 3181: e8 fa de ff ff callq 1080 <__printf_chk@plt> 3186: 41 83 fc 01 cmp $0x1,%r12d 318a: 0f 8e b6 f7 ff ff jle 2946 3190: 4c 8b 6d 08 mov 0x8(%rbp),%r13 3194: ba 06 00 00 00 mov $0x6,%edx 3199: 48 8d 35 70 7f 00 00 lea 0x7f70(%rip),%rsi # b110 <_IO_stdin_used+0x110> 31a0: 4c 89 ef mov %r13,%rdi 31a3: e8 88 de ff ff callq 1030 31a8: 85 c0 test %eax,%eax 31aa: 75 37 jne 31e3 31ac: 48 8d 35 44 5a 00 00 lea 0x5a44(%rip),%rsi # 8bf7 31b3: f3 0f 10 05 cd 8e 00 movss 0x8ecd(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 31ba: 00 31bb: bf 00 2f 68 59 mov $0x59682f00,%edi 31c0: e8 7b 78 00 00 callq aa40 31c5: bf 01 00 00 00 mov $0x1,%edi 31ca: b8 01 00 00 00 mov $0x1,%eax 31cf: 48 8d 35 87 7e 00 
00 lea 0x7e87(%rip),%rsi # b05d <_IO_stdin_used+0x5d> 31d6: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 31da: e8 a1 de ff ff callq 1080 <__printf_chk@plt> 31df: 4c 8b 6d 08 mov 0x8(%rbp),%r13 31e3: ba 09 00 00 00 mov $0x9,%edx 31e8: 48 8d 35 0e 7f 00 00 lea 0x7f0e(%rip),%rsi # b0fd <_IO_stdin_used+0xfd> 31ef: 4c 89 ef mov %r13,%rdi 31f2: e8 39 de ff ff callq 1030 31f7: 85 c0 test %eax,%eax 31f9: 0f 84 63 15 00 00 je 4762 31ff: ba 0f 00 00 00 mov $0xf,%edx 3204: 48 8d 35 fc 7e 00 00 lea 0x7efc(%rip),%rsi # b107 <_IO_stdin_used+0x107> 320b: 4c 89 ef mov %r13,%rdi 320e: e8 1d de ff ff callq 1030 3213: 85 c0 test %eax,%eax 3215: 0f 85 01 32 00 00 jne 641c 321b: 48 8d 35 b6 5a 00 00 lea 0x5ab6(%rip),%rsi # 8cd8 3222: f3 0f 10 05 5e 8e 00 movss 0x8e5e(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 3229: 00 322a: bf 00 2f 68 59 mov $0x59682f00,%edi 322f: e8 0c 78 00 00 callq aa40 3234: bf 01 00 00 00 mov $0x1,%edi 3239: b8 01 00 00 00 mov $0x1,%eax 323e: 48 8d 35 83 85 00 00 lea 0x8583(%rip),%rsi # b7c8 <_IO_stdin_used+0x7c8> 3245: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3249: e8 32 de ff ff callq 1080 <__printf_chk@plt> 324e: 41 83 fc 01 cmp $0x1,%r12d 3252: 0f 8e ee f6 ff ff jle 2946 3258: 4c 8b 6d 08 mov 0x8(%rbp),%r13 325c: ba 0d 00 00 00 mov $0xd,%edx 3261: 48 8d 35 af 7e 00 00 lea 0x7eaf(%rip),%rsi # b117 <_IO_stdin_used+0x117> 3268: 4c 89 ef mov %r13,%rdi 326b: e8 c0 dd ff ff callq 1030 3270: 85 c0 test %eax,%eax 3272: 0f 85 4a 31 00 00 jne 63c2 3278: 48 8d 35 4a 62 00 00 lea 0x624a(%rip),%rsi # 94c9 327f: f3 0f 10 05 01 8e 00 movss 0x8e01(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 3286: 00 3287: bf 05 7a d7 03 mov $0x3d77a05,%edi 328c: 48 c1 e7 09 shl $0x9,%rdi 3290: e8 ab 77 00 00 callq aa40 3295: bf 01 00 00 00 mov $0x1,%edi 329a: b8 01 00 00 00 mov $0x1,%eax 329f: 48 8d 35 4a 85 00 00 lea 0x854a(%rip),%rsi # b7f0 <_IO_stdin_used+0x7f0> 32a6: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 32aa: e8 d1 dd ff ff callq 1080 <__printf_chk@plt> 32af: 4c 8b 6d 08 mov 0x8(%rbp),%r13 32b3: ba 09 00 00 00 
mov $0x9,%edx 32b8: 48 8d 35 69 7e 00 00 lea 0x7e69(%rip),%rsi # b128 <_IO_stdin_used+0x128> 32bf: 4c 89 ef mov %r13,%rdi 32c2: e8 69 dd ff ff callq 1030 32c7: 85 c0 test %eax,%eax 32c9: 0f 85 8b 30 00 00 jne 635a 32cf: 48 8d 35 b7 47 00 00 lea 0x47b7(%rip),%rsi # 7a8d 32d6: f3 0f 10 05 aa 8d 00 movss 0x8daa(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 32dd: 00 32de: bf 00 2f 68 59 mov $0x59682f00,%edi 32e3: e8 58 77 00 00 callq aa40 32e8: bf 01 00 00 00 mov $0x1,%edi 32ed: b8 01 00 00 00 mov $0x1,%eax 32f2: 48 8d 35 27 85 00 00 lea 0x8527(%rip),%rsi # b820 <_IO_stdin_used+0x820> 32f9: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 32fd: e8 7e dd ff ff callq 1080 <__printf_chk@plt> 3302: 4c 8b 6d 08 mov 0x8(%rbp),%r13 3306: ba 0c 00 00 00 mov $0xc,%edx 330b: 48 8d 35 13 7e 00 00 lea 0x7e13(%rip),%rsi # b125 <_IO_stdin_used+0x125> 3312: 4c 89 ef mov %r13,%rdi 3315: e8 16 dd ff ff callq 1030 331a: 85 c0 test %eax,%eax 331c: 0f 85 de 2f 00 00 jne 6300 3322: 48 8d 35 87 4b 00 00 lea 0x4b87(%rip),%rsi # 7eb0 3329: f3 0f 10 05 57 8d 00 movss 0x8d57(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 3330: 00 3331: 4c 89 f7 mov %r14,%rdi 3334: e8 07 77 00 00 callq aa40 3339: bf 01 00 00 00 mov $0x1,%edi 333e: b8 01 00 00 00 mov $0x1,%eax 3343: f3 0f 10 0d 3d 8d 00 movss 0x8d3d(%rip),%xmm1 # c088 <_IO_stdin_used+0x1088> 334a: 00 334b: 48 8d 35 ee 84 00 00 lea 0x84ee(%rip),%rsi # b840 <_IO_stdin_used+0x840> 3352: f3 0f 5e c8 divss %xmm0,%xmm1 3356: 66 0f ef c0 pxor %xmm0,%xmm0 335a: f3 0f 5a c1 cvtss2sd %xmm1,%xmm0 335e: e8 1d dd ff ff callq 1080 <__printf_chk@plt> 3363: 4c 8b 6d 08 mov 0x8(%rbp),%r13 3367: ba 09 00 00 00 mov $0x9,%edx 336c: 48 8d 35 d0 7d 00 00 lea 0x7dd0(%rip),%rsi # b143 <_IO_stdin_used+0x143> 3373: 4c 89 ef mov %r13,%rdi 3376: e8 b5 dc ff ff callq 1030 337b: 85 c0 test %eax,%eax 337d: 0f 85 1f 2f 00 00 jne 62a2 3383: 48 8d 35 87 45 00 00 lea 0x4587(%rip),%rsi # 7911 338a: f3 0f 10 05 f6 8c 00 movss 0x8cf6(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 3391: 00 3392: bf 00 2f 68 59 
mov $0x59682f00,%edi 3397: e8 a4 76 00 00 callq aa40 339c: bf 01 00 00 00 mov $0x1,%edi 33a1: b8 01 00 00 00 mov $0x1,%eax 33a6: 48 8d 35 bb 84 00 00 lea 0x84bb(%rip),%rsi # b868 <_IO_stdin_used+0x868> 33ad: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 33b1: e8 ca dc ff ff callq 1080 <__printf_chk@plt> 33b6: 4c 8b 6d 08 mov 0x8(%rbp),%r13 33ba: ba 09 00 00 00 mov $0x9,%edx 33bf: 48 8d 35 8a 7d 00 00 lea 0x7d8a(%rip),%rsi # b150 <_IO_stdin_used+0x150> 33c6: 4c 89 ef mov %r13,%rdi 33c9: e8 62 dc ff ff callq 1030 33ce: 85 c0 test %eax,%eax 33d0: 0f 85 5a 2e 00 00 jne 6230 33d6: 48 8d 35 f2 45 00 00 lea 0x45f2(%rip),%rsi # 79cf 33dd: f3 0f 10 05 a3 8c 00 movss 0x8ca3(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 33e4: 00 33e5: bf 00 2f 68 59 mov $0x59682f00,%edi 33ea: e8 51 76 00 00 callq aa40 33ef: bf 01 00 00 00 mov $0x1,%edi 33f4: b8 01 00 00 00 mov $0x1,%eax 33f9: 48 8d 35 90 84 00 00 lea 0x8490(%rip),%rsi # b890 <_IO_stdin_used+0x890> 3400: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3404: e8 77 dc ff ff callq 1080 <__printf_chk@plt> 3409: 4c 8b 6d 08 mov 0x8(%rbp),%r13 340d: ba 0d 00 00 00 mov $0xd,%edx 3412: 48 8d 35 19 7d 00 00 lea 0x7d19(%rip),%rsi # b132 <_IO_stdin_used+0x132> 3419: 4c 89 ef mov %r13,%rdi 341c: e8 0f dc ff ff callq 1030 3421: 85 c0 test %eax,%eax 3423: 0f 85 99 2d 00 00 jne 61c2 3429: 48 8d 35 99 4d 00 00 lea 0x4d99(%rip),%rsi # 81c9 3430: f3 0f 10 05 50 8c 00 movss 0x8c50(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 3437: 00 3438: 4c 89 f7 mov %r14,%rdi 343b: e8 00 76 00 00 callq aa40 3440: bf 01 00 00 00 mov $0x1,%edi 3445: b8 01 00 00 00 mov $0x1,%eax 344a: f3 0f 10 3d 36 8c 00 movss 0x8c36(%rip),%xmm7 # c088 <_IO_stdin_used+0x1088> 3451: 00 3452: 48 8d 35 67 84 00 00 lea 0x8467(%rip),%rsi # b8c0 <_IO_stdin_used+0x8c0> 3459: f3 0f 11 7c 24 0c movss %xmm7,0xc(%rsp) 345f: f3 0f 5e f8 divss %xmm0,%xmm7 3463: 66 0f ef c0 pxor %xmm0,%xmm0 3467: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 346b: e8 10 dc ff ff callq 1080 <__printf_chk@plt> 3470: 4c 8b 6d 08 mov 0x8(%rbp),%r13 3474: ba 
0c 00 00 00 mov $0xc,%edx 3479: 48 8d 35 c0 7c 00 00 lea 0x7cc0(%rip),%rsi # b140 <_IO_stdin_used+0x140> 3480: 4c 89 ef mov %r13,%rdi 3483: e8 a8 db ff ff callq 1030 3488: 85 c0 test %eax,%eax 348a: 0f 85 ca 2c 00 00 jne 615a 3490: 48 8d 35 8e 4b 00 00 lea 0x4b8e(%rip),%rsi # 8025 3497: f3 0f 10 05 e9 8b 00 movss 0x8be9(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 349e: 00 349f: 4c 89 f7 mov %r14,%rdi 34a2: e8 99 75 00 00 callq aa40 34a7: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 34ad: bf 01 00 00 00 mov $0x1,%edi 34b2: 48 8d 35 37 84 00 00 lea 0x8437(%rip),%rsi # b8f0 <_IO_stdin_used+0x8f0> 34b9: b8 01 00 00 00 mov $0x1,%eax 34be: f3 0f 5e f8 divss %xmm0,%xmm7 34c2: 66 0f ef c0 pxor %xmm0,%xmm0 34c6: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 34ca: e8 b1 db ff ff callq 1080 <__printf_chk@plt> 34cf: 4c 8b 6d 08 mov 0x8(%rbp),%r13 34d3: ba 0d 00 00 00 mov $0xd,%edx 34d8: 48 8d 35 6e 7c 00 00 lea 0x7c6e(%rip),%rsi # b14d <_IO_stdin_used+0x14d> 34df: 4c 89 ef mov %r13,%rdi 34e2: e8 49 db ff ff callq 1030 34e7: 85 c0 test %eax,%eax 34e9: 0f 85 0d 2c 00 00 jne 60fc 34ef: 48 8d 35 01 4c 00 00 lea 0x4c01(%rip),%rsi # 80f7 34f6: f3 0f 10 05 8a 8b 00 movss 0x8b8a(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 34fd: 00 34fe: 4c 89 f7 mov %r14,%rdi 3501: e8 3a 75 00 00 callq aa40 3506: f3 0f 10 74 24 0c movss 0xc(%rsp),%xmm6 350c: bf 01 00 00 00 mov $0x1,%edi 3511: 48 8d 35 08 84 00 00 lea 0x8408(%rip),%rsi # b920 <_IO_stdin_used+0x920> 3518: b8 01 00 00 00 mov $0x1,%eax 351d: f3 0f 5e f0 divss %xmm0,%xmm6 3521: 66 0f ef c0 pxor %xmm0,%xmm0 3525: f3 0f 5a c6 cvtss2sd %xmm6,%xmm0 3529: e8 52 db ff ff callq 1080 <__printf_chk@plt> 352e: 4c 8b 6d 08 mov 0x8(%rbp),%r13 3532: ba 0b 00 00 00 mov $0xb,%edx 3537: 48 8d 35 1e 7c 00 00 lea 0x7c1e(%rip),%rsi # b15c <_IO_stdin_used+0x15c> 353e: 4c 89 ef mov %r13,%rdi 3541: e8 ea da ff ff callq 1030 3546: 85 c0 test %eax,%eax 3548: 0f 85 50 2b 00 00 jne 609e 354e: 48 8d 35 cd 5a 00 00 lea 0x5acd(%rip),%rsi # 9022 3555: f3 0f 10 05 2b 8b 00 movss 
0x8b2b(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 355c: 00 355d: bf 05 7a d7 03 mov $0x3d77a05,%edi 3562: 48 c1 e7 09 shl $0x9,%rdi 3566: e8 d5 74 00 00 callq aa40 356b: bf 01 00 00 00 mov $0x1,%edi 3570: b8 01 00 00 00 mov $0x1,%eax 3575: 48 8d 35 dc 83 00 00 lea 0x83dc(%rip),%rsi # b958 <_IO_stdin_used+0x958> 357c: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3580: e8 fb da ff ff callq 1080 <__printf_chk@plt> 3585: 4c 8b 6d 08 mov 0x8(%rbp),%r13 3589: ba 0b 00 00 00 mov $0xb,%edx 358e: 48 8d 35 d4 7b 00 00 lea 0x7bd4(%rip),%rsi # b169 <_IO_stdin_used+0x169> 3595: 4c 89 ef mov %r13,%rdi 3598: e8 93 da ff ff callq 1030 359d: 85 c0 test %eax,%eax 359f: 0f 85 9f 2a 00 00 jne 6044 35a5: 48 8d 35 57 5b 00 00 lea 0x5b57(%rip),%rsi # 9103 35ac: f3 0f 10 05 d4 8a 00 movss 0x8ad4(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 35b3: 00 35b4: bf 05 7a d7 03 mov $0x3d77a05,%edi 35b9: 48 c1 e7 09 shl $0x9,%rdi 35bd: e8 7e 74 00 00 callq aa40 35c2: bf 01 00 00 00 mov $0x1,%edi 35c7: b8 01 00 00 00 mov $0x1,%eax 35cc: 48 8d 35 ad 83 00 00 lea 0x83ad(%rip),%rsi # b980 <_IO_stdin_used+0x980> 35d3: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 35d7: e8 a4 da ff ff callq 1080 <__printf_chk@plt> 35dc: 4c 8b 6d 08 mov 0x8(%rbp),%r13 35e0: ba 07 00 00 00 mov $0x7,%edx 35e5: 48 8d 35 8d 7b 00 00 lea 0x7b8d(%rip),%rsi # b179 <_IO_stdin_used+0x179> 35ec: 4c 89 ef mov %r13,%rdi 35ef: e8 3c da ff ff callq 1030 35f4: 85 c0 test %eax,%eax 35f6: 0f 85 ee 29 00 00 jne 5fea 35fc: f3 0f 10 05 84 8a 00 movss 0x8a84(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 3603: 00 3604: 48 8d 35 a5 73 00 00 lea 0x73a5(%rip),%rsi # a9b0 360b: bf 00 2f 68 59 mov $0x59682f00,%edi 3610: e8 2b 74 00 00 callq aa40 3615: bf 01 00 00 00 mov $0x1,%edi 361a: b8 01 00 00 00 mov $0x1,%eax 361f: 48 8d 35 52 7a 00 00 lea 0x7a52(%rip),%rsi # b078 <_IO_stdin_used+0x78> 3626: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 362a: e8 51 da ff ff callq 1080 <__printf_chk@plt> 362f: 4c 8b 6d 08 mov 0x8(%rbp),%r13 3633: ba 07 00 00 00 mov $0x7,%edx 3638: 48 8d 35 42 7b 00 00 
lea 0x7b42(%rip),%rsi # b181 <_IO_stdin_used+0x181> 363f: 4c 89 ef mov %r13,%rdi 3642: e8 e9 d9 ff ff callq 1030 3647: 85 c0 test %eax,%eax 3649: 0f 85 41 29 00 00 jne 5f90 364f: f3 0f 10 05 31 8a 00 movss 0x8a31(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 3656: 00 3657: 48 8d 35 a2 73 00 00 lea 0x73a2(%rip),%rsi # aa00 365e: bf 00 2f 68 59 mov $0x59682f00,%edi 3663: e8 d8 73 00 00 callq aa40 3668: bf 01 00 00 00 mov $0x1,%edi 366d: b8 01 00 00 00 mov $0x1,%eax 3672: 48 8d 35 1c 7a 00 00 lea 0x7a1c(%rip),%rsi # b095 <_IO_stdin_used+0x95> 3679: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 367d: e8 fe d9 ff ff callq 1080 <__printf_chk@plt> 3682: 4c 8b 6d 08 mov 0x8(%rbp),%r13 3686: ba 09 00 00 00 mov $0x9,%edx 368b: 48 8d 35 f8 7a 00 00 lea 0x7af8(%rip),%rsi # b18a <_IO_stdin_used+0x18a> 3692: 4c 89 ef mov %r13,%rdi 3695: e8 96 d9 ff ff callq 1030 369a: 85 c0 test %eax,%eax 369c: 0f 85 94 28 00 00 jne 5f36 36a2: 48 8d 35 76 4d 00 00 lea 0x4d76(%rip),%rsi # 841f 36a9: f3 0f 10 05 d7 89 00 movss 0x89d7(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 36b0: 00 36b1: bf 00 2f 68 59 mov $0x59682f00,%edi 36b6: e8 85 73 00 00 callq aa40 36bb: bf 01 00 00 00 mov $0x1,%edi 36c0: b8 01 00 00 00 mov $0x1,%eax 36c5: 48 8d 35 e7 79 00 00 lea 0x79e7(%rip),%rsi # b0b3 <_IO_stdin_used+0xb3> 36cc: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 36d0: e8 ab d9 ff ff callq 1080 <__printf_chk@plt> 36d5: 4c 8b 6d 08 mov 0x8(%rbp),%r13 36d9: ba 09 00 00 00 mov $0x9,%edx 36de: 48 8d 35 af 7a 00 00 lea 0x7aaf(%rip),%rsi # b194 <_IO_stdin_used+0x194> 36e5: 4c 89 ef mov %r13,%rdi 36e8: e8 43 d9 ff ff callq 1030 36ed: 85 c0 test %eax,%eax 36ef: 0f 85 e7 27 00 00 jne 5edc 36f5: 48 8d 35 87 50 00 00 lea 0x5087(%rip),%rsi # 8783 36fc: f3 0f 10 05 84 89 00 movss 0x8984(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 3703: 00 3704: bf 00 2f 68 59 mov $0x59682f00,%edi 3709: e8 32 73 00 00 callq aa40 370e: bf 01 00 00 00 mov $0x1,%edi 3713: b8 01 00 00 00 mov $0x1,%eax 3718: 48 8d 35 aa 79 00 00 lea 0x79aa(%rip),%rsi # b0c9 
<_IO_stdin_used+0xc9> 371f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3723: e8 58 d9 ff ff callq 1080 <__printf_chk@plt> 3728: 4c 8b 6d 08 mov 0x8(%rbp),%r13 372c: ba 0c 00 00 00 mov $0xc,%edx 3731: 48 8d 35 66 7a 00 00 lea 0x7a66(%rip),%rsi # b19e <_IO_stdin_used+0x19e> 3738: 4c 89 ef mov %r13,%rdi 373b: e8 f0 d8 ff ff callq 1030 3740: 85 c0 test %eax,%eax 3742: 0f 85 3a 27 00 00 jne 5e82 3748: 48 8d 35 6a 4d 00 00 lea 0x4d6a(%rip),%rsi # 84b9 374f: f3 0f 10 05 31 89 00 movss 0x8931(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 3756: 00 3757: bf 00 2f 68 59 mov $0x59682f00,%edi 375c: e8 df 72 00 00 callq aa40 3761: bf 01 00 00 00 mov $0x1,%edi 3766: b8 01 00 00 00 mov $0x1,%eax 376b: 48 8d 35 3e 82 00 00 lea 0x823e(%rip),%rsi # b9b0 <_IO_stdin_used+0x9b0> 3772: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3776: e8 05 d9 ff ff callq 1080 <__printf_chk@plt> 377b: 4c 8b 6d 08 mov 0x8(%rbp),%r13 377f: ba 0c 00 00 00 mov $0xc,%edx 3784: 48 8d 35 20 7a 00 00 lea 0x7a20(%rip),%rsi # b1ab <_IO_stdin_used+0x1ab> 378b: 4c 89 ef mov %r13,%rdi 378e: e8 9d d8 ff ff callq 1030 3793: 85 c0 test %eax,%eax 3795: 0f 85 89 26 00 00 jne 5e24 379b: 48 8d 35 d1 4d 00 00 lea 0x4dd1(%rip),%rsi # 8573 37a2: f3 0f 10 05 de 88 00 movss 0x88de(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 37a9: 00 37aa: bf 00 2f 68 59 mov $0x59682f00,%edi 37af: e8 8c 72 00 00 callq aa40 37b4: bf 01 00 00 00 mov $0x1,%edi 37b9: b8 01 00 00 00 mov $0x1,%eax 37be: 48 8d 35 1a 79 00 00 lea 0x791a(%rip),%rsi # b0df <_IO_stdin_used+0xdf> 37c5: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 37c9: e8 b2 d8 ff ff callq 1080 <__printf_chk@plt> 37ce: 4c 8b 6d 08 mov 0x8(%rbp),%r13 37d2: ba 0c 00 00 00 mov $0xc,%edx 37d7: 48 8d 35 da 79 00 00 lea 0x79da(%rip),%rsi # b1b8 <_IO_stdin_used+0x1b8> 37de: 4c 89 ef mov %r13,%rdi 37e1: e8 4a d8 ff ff callq 1030 37e6: 85 c0 test %eax,%eax 37e8: 0f 85 7d f1 ff ff jne 296b 37ee: 48 8d 35 f8 4e 00 00 lea 0x4ef8(%rip),%rsi # 86ed 37f5: f3 0f 10 05 8b 88 00 movss 0x888b(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 37fc: 00 
37fd: bf 00 2f 68 59 mov $0x59682f00,%edi 3802: e8 39 72 00 00 callq aa40 3807: bf 01 00 00 00 mov $0x1,%edi 380c: b8 01 00 00 00 mov $0x1,%eax 3811: 48 8d 35 b8 81 00 00 lea 0x81b8(%rip),%rsi # b9d0 <_IO_stdin_used+0x9d0> 3818: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 381c: e8 5f d8 ff ff callq 1080 <__printf_chk@plt> 3821: 4c 8b 6d 08 mov 0x8(%rbp),%r13 3825: ba 0c 00 00 00 mov $0xc,%edx 382a: 48 8d 35 87 79 00 00 lea 0x7987(%rip),%rsi # b1b8 <_IO_stdin_used+0x1b8> 3831: 4c 89 ef mov %r13,%rdi 3834: e8 f7 d7 ff ff callq 1030 3839: 85 c0 test %eax,%eax 383b: 0f 85 ce 2c 00 00 jne 650f 3841: f3 0f 10 35 3f 88 00 movss 0x883f(%rip),%xmm6 # c088 <_IO_stdin_used+0x1088> 3848: 00 3849: f3 0f 11 74 24 0c movss %xmm6,0xc(%rsp) 384f: f3 0f 11 74 24 08 movss %xmm6,0x8(%rsp) 3855: 48 8d 35 d8 4d 00 00 lea 0x4dd8(%rip),%rsi # 8634 385c: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3862: bf 00 2f 68 59 mov $0x59682f00,%edi 3867: e8 d4 71 00 00 callq aa40 386c: bf 01 00 00 00 mov $0x1,%edi 3871: b8 01 00 00 00 mov $0x1,%eax 3876: 48 8d 35 7b 81 00 00 lea 0x817b(%rip),%rsi # b9f8 <_IO_stdin_used+0x9f8> 387d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3881: e8 fa d7 ff ff callq 1080 <__printf_chk@plt> 3886: 41 83 fc 01 cmp $0x1,%r12d 388a: 0f 85 3e d9 ff ff jne 11ce 3890: 48 8d 35 e1 34 00 00 lea 0x34e1(%rip),%rsi # 6d78 3897: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 389d: 4c 89 f7 mov %r14,%rdi 38a0: e8 9b 71 00 00 callq aa40 38a5: bf 01 00 00 00 mov $0x1,%edi 38aa: b8 01 00 00 00 mov $0x1,%eax 38af: 48 8d 35 96 7e 00 00 lea 0x7e96(%rip),%rsi # b74c <_IO_stdin_used+0x74c> 38b6: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 38ba: e8 c1 d7 ff ff callq 1080 <__printf_chk@plt> 38bf: 48 8d 35 79 34 00 00 lea 0x3479(%rip),%rsi # 6d3f 38c6: 4c 89 f7 mov %r14,%rdi 38c9: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 38cf: e8 6c 71 00 00 callq aa40 38d4: bf 01 00 00 00 mov $0x1,%edi 38d9: b8 01 00 00 00 mov $0x1,%eax 38de: 48 8d 35 4c 7e 00 00 lea 0x7e4c(%rip),%rsi # b731 <_IO_stdin_used+0x731> 38e5: f3 0f 5a c0 cvtss2sd 
%xmm0,%xmm0 38e9: e8 92 d7 ff ff callq 1080 <__printf_chk@plt> 38ee: 48 8d 35 a9 34 00 00 lea 0x34a9(%rip),%rsi # 6d9e 38f5: 4c 89 f7 mov %r14,%rdi 38f8: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 38fe: e8 3d 71 00 00 callq aa40 3903: bf 01 00 00 00 mov $0x1,%edi 3908: b8 01 00 00 00 mov $0x1,%eax 390d: 48 8d 35 09 7e 00 00 lea 0x7e09(%rip),%rsi # b71d <_IO_stdin_used+0x71d> 3914: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3918: e8 63 d7 ff ff callq 1080 <__printf_chk@plt> 391d: 48 8d 2d 06 35 00 00 lea 0x3506(%rip),%rbp # 6e2a 3924: 4c 89 f7 mov %r14,%rdi 3927: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 392d: 48 89 ee mov %rbp,%rsi 3930: e8 0b 71 00 00 callq aa40 3935: bf 01 00 00 00 mov $0x1,%edi 393a: b8 01 00 00 00 mov $0x1,%eax 393f: 48 8d 35 ba 7d 00 00 lea 0x7dba(%rip),%rsi # b700 <_IO_stdin_used+0x700> 3946: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 394a: e8 31 d7 ff ff callq 1080 <__printf_chk@plt> 394f: 48 89 ee mov %rbp,%rsi 3952: 4c 89 f7 mov %r14,%rdi 3955: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 395b: e8 e0 70 00 00 callq aa40 3960: bf 01 00 00 00 mov $0x1,%edi 3965: b8 01 00 00 00 mov $0x1,%eax 396a: 48 8d 35 72 7d 00 00 lea 0x7d72(%rip),%rsi # b6e3 <_IO_stdin_used+0x6e3> 3971: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3975: e8 06 d7 ff ff callq 1080 <__printf_chk@plt> 397a: 48 8d 35 c6 6b 00 00 lea 0x6bc6(%rip),%rsi # a547 3981: 4c 89 f7 mov %r14,%rdi 3984: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 398a: e8 b1 70 00 00 callq aa40 398f: bf 01 00 00 00 mov $0x1,%edi 3994: b8 01 00 00 00 mov $0x1,%eax 3999: 48 8d 35 25 7d 00 00 lea 0x7d25(%rip),%rsi # b6c5 <_IO_stdin_used+0x6c5> 39a0: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 39a4: e8 d7 d6 ff ff callq 1080 <__printf_chk@plt> 39a9: 48 8d 35 0c 6c 00 00 lea 0x6c0c(%rip),%rsi # a5bc 39b0: 4c 89 f7 mov %r14,%rdi 39b3: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 39b9: e8 82 70 00 00 callq aa40 39be: bf 01 00 00 00 mov $0x1,%edi 39c3: b8 01 00 00 00 mov $0x1,%eax 39c8: 48 8d 35 59 86 00 00 lea 0x8659(%rip),%rsi # c028 <_IO_stdin_used+0x1028> 39cf: f3 0f 
5a c0 cvtss2sd %xmm0,%xmm0 39d3: e8 a8 d6 ff ff callq 1080 <__printf_chk@plt> 39d8: 48 8d 35 1f 6d 00 00 lea 0x6d1f(%rip),%rsi # a6fe 39df: 4c 89 f7 mov %r14,%rdi 39e2: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 39e8: e8 53 70 00 00 callq aa40 39ed: bf 01 00 00 00 mov $0x1,%edi 39f2: b8 01 00 00 00 mov $0x1,%eax 39f7: 48 8d 35 af 7c 00 00 lea 0x7caf(%rip),%rsi # b6ad <_IO_stdin_used+0x6ad> 39fe: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3a02: e8 79 d6 ff ff callq 1080 <__printf_chk@plt> 3a07: 48 8d 35 25 6c 00 00 lea 0x6c25(%rip),%rsi # a633 3a0e: 4c 89 f7 mov %r14,%rdi 3a11: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3a17: e8 24 70 00 00 callq aa40 3a1c: bf 01 00 00 00 mov $0x1,%edi 3a21: b8 01 00 00 00 mov $0x1,%eax 3a26: 48 8d 35 68 7c 00 00 lea 0x7c68(%rip),%rsi # b695 <_IO_stdin_used+0x695> 3a2d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3a31: e8 4a d6 ff ff callq 1080 <__printf_chk@plt> 3a36: 48 8d 35 38 6d 00 00 lea 0x6d38(%rip),%rsi # a775 3a3d: 4c 89 f7 mov %r14,%rdi 3a40: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3a46: e8 f5 6f 00 00 callq aa40 3a4b: bf 01 00 00 00 mov $0x1,%edi 3a50: b8 01 00 00 00 mov $0x1,%eax 3a55: 48 8d 35 21 7c 00 00 lea 0x7c21(%rip),%rsi # b67d <_IO_stdin_used+0x67d> 3a5c: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3a60: e8 1b d6 ff ff callq 1080 <__printf_chk@plt> 3a65: 48 8d 35 0b 6e 00 00 lea 0x6e0b(%rip),%rsi # a877 3a6c: 4c 89 f7 mov %r14,%rdi 3a6f: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3a75: e8 c6 6f 00 00 callq aa40 3a7a: bf 01 00 00 00 mov $0x1,%edi 3a7f: b8 01 00 00 00 mov $0x1,%eax 3a84: 48 8d 35 db 7b 00 00 lea 0x7bdb(%rip),%rsi # b666 <_IO_stdin_used+0x666> 3a8b: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3a8f: e8 ec d5 ff ff callq 1080 <__printf_chk@plt> 3a94: 48 8d 35 53 6e 00 00 lea 0x6e53(%rip),%rsi # a8ee 3a9b: 4c 89 f7 mov %r14,%rdi 3a9e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3aa4: e8 97 6f 00 00 callq aa40 3aa9: bf 01 00 00 00 mov $0x1,%edi 3aae: b8 01 00 00 00 mov $0x1,%eax 3ab3: 48 8d 35 95 7b 00 00 lea 0x7b95(%rip),%rsi # b64f <_IO_stdin_used+0x64f> 
3aba: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3abe: e8 bd d5 ff ff callq 1080 <__printf_chk@plt> 3ac3: 48 8d 35 22 6d 00 00 lea 0x6d22(%rip),%rsi # a7ec 3aca: 4c 89 f7 mov %r14,%rdi 3acd: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3ad3: e8 68 6f 00 00 callq aa40 3ad8: bf 01 00 00 00 mov $0x1,%edi 3add: b8 01 00 00 00 mov $0x1,%eax 3ae2: 48 8d 35 17 85 00 00 lea 0x8517(%rip),%rsi # c000 <_IO_stdin_used+0x1000> 3ae9: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3aed: e8 8e d5 ff ff callq 1080 <__printf_chk@plt> 3af2: 48 8d 35 ad 31 00 00 lea 0x31ad(%rip),%rsi # 6ca6 3af9: 4c 89 f7 mov %r14,%rdi 3afc: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3b02: e8 39 6f 00 00 callq aa40 3b07: bf 01 00 00 00 mov $0x1,%edi 3b0c: b8 01 00 00 00 mov $0x1,%eax 3b11: 48 8d 35 c0 84 00 00 lea 0x84c0(%rip),%rsi # bfd8 <_IO_stdin_used+0xfd8> 3b18: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3b1c: e8 5f d5 ff ff callq 1080 <__printf_chk@plt> 3b21: 48 8d 35 7c 3c 00 00 lea 0x3c7c(%rip),%rsi # 77a4 3b28: 4c 89 f7 mov %r14,%rdi 3b2b: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3b31: e8 0a 6f 00 00 callq aa40 3b36: bf 01 00 00 00 mov $0x1,%edi 3b3b: b8 01 00 00 00 mov $0x1,%eax 3b40: 48 8d 35 eb 7a 00 00 lea 0x7aeb(%rip),%rsi # b632 <_IO_stdin_used+0x632> 3b47: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3b4b: e8 30 d5 ff ff callq 1080 <__printf_chk@plt> 3b50: 48 8d 35 90 3b 00 00 lea 0x3b90(%rip),%rsi # 76e7 3b57: 4c 89 f7 mov %r14,%rdi 3b5a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3b60: e8 db 6e 00 00 callq aa40 3b65: bf 01 00 00 00 mov $0x1,%edi 3b6a: b8 01 00 00 00 mov $0x1,%eax 3b6f: 48 8d 35 a1 7a 00 00 lea 0x7aa1(%rip),%rsi # b617 <_IO_stdin_used+0x617> 3b76: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3b7a: e8 01 d5 ff ff callq 1080 <__printf_chk@plt> 3b7f: 48 8d 35 6b 39 00 00 lea 0x396b(%rip),%rsi # 74f1 3b86: 4c 89 f7 mov %r14,%rdi 3b89: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3b8f: e8 ac 6e 00 00 callq aa40 3b94: bf 01 00 00 00 mov $0x1,%edi 3b99: b8 01 00 00 00 mov $0x1,%eax 3b9e: 48 8d 35 59 7a 00 00 lea 0x7a59(%rip),%rsi # b5fe 
<_IO_stdin_used+0x5fe> 3ba5: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3ba9: e8 d2 d4 ff ff callq 1080 <__printf_chk@plt> 3bae: 48 8d 35 14 3a 00 00 lea 0x3a14(%rip),%rsi # 75c9 3bb5: 4c 89 f7 mov %r14,%rdi 3bb8: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3bbe: e8 7d 6e 00 00 callq aa40 3bc3: bf 01 00 00 00 mov $0x1,%edi 3bc8: b8 01 00 00 00 mov $0x1,%eax 3bcd: 48 8d 35 14 7a 00 00 lea 0x7a14(%rip),%rsi # b5e8 <_IO_stdin_used+0x5e8> 3bd4: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3bd8: e8 a3 d4 ff ff callq 1080 <__printf_chk@plt> 3bdd: 48 8d 35 4c 67 00 00 lea 0x674c(%rip),%rsi # a330 3be4: 4c 89 f7 mov %r14,%rdi 3be7: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3bed: e8 4e 6e 00 00 callq aa40 3bf2: bf 01 00 00 00 mov $0x1,%edi 3bf7: b8 01 00 00 00 mov $0x1,%eax 3bfc: 48 8d 35 d1 79 00 00 lea 0x79d1(%rip),%rsi # b5d4 <_IO_stdin_used+0x5d4> 3c03: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3c07: e8 74 d4 ff ff callq 1080 <__printf_chk@plt> 3c0c: 48 8d 35 80 68 00 00 lea 0x6880(%rip),%rsi # a493 3c13: 4c 89 f7 mov %r14,%rdi 3c16: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3c1c: e8 1f 6e 00 00 callq aa40 3c21: bf 01 00 00 00 mov $0x1,%edi 3c26: b8 01 00 00 00 mov $0x1,%eax 3c2b: 48 8d 35 8e 79 00 00 lea 0x798e(%rip),%rsi # b5c0 <_IO_stdin_used+0x5c0> 3c32: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3c36: e8 45 d4 ff ff callq 1080 <__printf_chk@plt> 3c3b: 48 8d 35 a2 67 00 00 lea 0x67a2(%rip),%rsi # a3e4 3c42: 4c 89 f7 mov %r14,%rdi 3c45: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3c4b: e8 f0 6d 00 00 callq aa40 3c50: bf 01 00 00 00 mov $0x1,%edi 3c55: b8 01 00 00 00 mov $0x1,%eax 3c5a: 48 8d 35 43 79 00 00 lea 0x7943(%rip),%rsi # b5a4 <_IO_stdin_used+0x5a4> 3c61: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3c65: e8 16 d4 ff ff callq 1080 <__printf_chk@plt> 3c6a: 48 8d 35 55 33 00 00 lea 0x3355(%rip),%rsi # 6fc6 3c71: 4c 89 f7 mov %r14,%rdi 3c74: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3c7a: e8 c1 6d 00 00 callq aa40 3c7f: bf 01 00 00 00 mov $0x1,%edi 3c84: b8 01 00 00 00 mov $0x1,%eax 3c89: 48 8d 35 fd 78 00 00 lea 
0x78fd(%rip),%rsi # b58d <_IO_stdin_used+0x58d> 3c90: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3c94: e8 e7 d3 ff ff callq 1080 <__printf_chk@plt> 3c99: 48 8d 35 9a 32 00 00 lea 0x329a(%rip),%rsi # 6f3a 3ca0: 4c 89 f7 mov %r14,%rdi 3ca3: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3ca9: e8 92 6d 00 00 callq aa40 3cae: bf 01 00 00 00 mov $0x1,%edi 3cb3: b8 01 00 00 00 mov $0x1,%eax 3cb8: 48 8d 35 b7 78 00 00 lea 0x78b7(%rip),%rsi # b576 <_IO_stdin_used+0x576> 3cbf: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3cc3: e8 b8 d3 ff ff callq 1080 <__printf_chk@plt> 3cc8: 48 8d 35 83 33 00 00 lea 0x3383(%rip),%rsi # 7052 3ccf: 4c 89 f7 mov %r14,%rdi 3cd2: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3cd8: e8 63 6d 00 00 callq aa40 3cdd: bf 01 00 00 00 mov $0x1,%edi 3ce2: b8 01 00 00 00 mov $0x1,%eax 3ce7: 48 8d 35 ca 82 00 00 lea 0x82ca(%rip),%rsi # bfb8 <_IO_stdin_used+0xfb8> 3cee: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3cf2: e8 89 d3 ff ff callq 1080 <__printf_chk@plt> 3cf7: 48 8d 35 e0 33 00 00 lea 0x33e0(%rip),%rsi # 70de 3cfe: 4c 89 f7 mov %r14,%rdi 3d01: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3d07: e8 34 6d 00 00 callq aa40 3d0c: bf 01 00 00 00 mov $0x1,%edi 3d11: b8 01 00 00 00 mov $0x1,%eax 3d16: 48 8d 35 3e 78 00 00 lea 0x783e(%rip),%rsi # b55b <_IO_stdin_used+0x55b> 3d1d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3d21: e8 5a d3 ff ff callq 1080 <__printf_chk@plt> 3d26: 48 8d 35 03 35 00 00 lea 0x3503(%rip),%rsi # 7230 3d2d: 4c 89 f7 mov %r14,%rdi 3d30: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3d36: e8 05 6d 00 00 callq aa40 3d3b: bf 01 00 00 00 mov $0x1,%edi 3d40: b8 01 00 00 00 mov $0x1,%eax 3d45: 48 8d 35 fc 77 00 00 lea 0x77fc(%rip),%rsi # b548 <_IO_stdin_used+0x548> 3d4c: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3d50: e8 2b d3 ff ff callq 1080 <__printf_chk@plt> 3d55: 48 8d 35 cd 36 00 00 lea 0x36cd(%rip),%rsi # 7429 3d5c: 4c 89 f7 mov %r14,%rdi 3d5f: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3d65: e8 d6 6c 00 00 callq aa40 3d6a: bf 01 00 00 00 mov $0x1,%edi 3d6f: b8 01 00 00 00 mov $0x1,%eax 3d74: 48 8d 35 b2 77 
00 00 lea 0x77b2(%rip),%rsi # b52d <_IO_stdin_used+0x52d> 3d7b: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3d7f: e8 fc d2 ff ff callq 1080 <__printf_chk@plt> 3d84: 48 8d 35 08 34 00 00 lea 0x3408(%rip),%rsi # 7193 3d8b: 4c 89 f7 mov %r14,%rdi 3d8e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3d94: e8 a7 6c 00 00 callq aa40 3d99: bf 01 00 00 00 mov $0x1,%edi 3d9e: b8 01 00 00 00 mov $0x1,%eax 3da3: 48 8d 35 68 77 00 00 lea 0x7768(%rip),%rsi # b512 <_IO_stdin_used+0x512> 3daa: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3dae: e8 cd d2 ff ff callq 1080 <__printf_chk@plt> 3db3: 48 8d 35 19 35 00 00 lea 0x3519(%rip),%rsi # 72d3 3dba: 4c 89 f7 mov %r14,%rdi 3dbd: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3dc3: e8 78 6c 00 00 callq aa40 3dc8: bf 01 00 00 00 mov $0x1,%edi 3dcd: b8 01 00 00 00 mov $0x1,%eax 3dd2: 48 8d 35 20 77 00 00 lea 0x7720(%rip),%rsi # b4f9 <_IO_stdin_used+0x4f9> 3dd9: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3ddd: e8 9e d2 ff ff callq 1080 <__printf_chk@plt> 3de2: 48 8d 35 8d 35 00 00 lea 0x358d(%rip),%rsi # 7376 3de9: 4c 89 f7 mov %r14,%rdi 3dec: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3df2: e8 49 6c 00 00 callq aa40 3df7: bf 01 00 00 00 mov $0x1,%edi 3dfc: b8 01 00 00 00 mov $0x1,%eax 3e01: 48 8d 35 88 81 00 00 lea 0x8188(%rip),%rsi # bf90 <_IO_stdin_used+0xf90> 3e08: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3e0c: e8 6f d2 ff ff callq 1080 <__printf_chk@plt> 3e11: 48 8d 35 73 3a 00 00 lea 0x3a73(%rip),%rsi # 788b 3e18: 4c 89 f7 mov %r14,%rdi 3e1b: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3e21: e8 1a 6c 00 00 callq aa40 3e26: bf 01 00 00 00 mov $0x1,%edi 3e2b: b8 01 00 00 00 mov $0x1,%eax 3e30: 48 8d 35 31 81 00 00 lea 0x8131(%rip),%rsi # bf68 <_IO_stdin_used+0xf68> 3e37: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3e3b: e8 40 d2 ff ff callq 1080 <__printf_chk@plt> 3e40: 48 8d 35 fd 3e 00 00 lea 0x3efd(%rip),%rsi # 7d44 3e47: 4c 89 f7 mov %r14,%rdi 3e4a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3e50: e8 eb 6b 00 00 callq aa40 3e55: bf 01 00 00 00 mov $0x1,%edi 3e5a: b8 01 00 00 00 mov $0x1,%eax 3e5f: 48 
8d 35 ca 80 00 00 lea 0x80ca(%rip),%rsi # bf30 <_IO_stdin_used+0xf30> 3e66: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3e6a: e8 11 d2 ff ff callq 1080 <__printf_chk@plt> 3e6f: 48 8d 35 93 3f 00 00 lea 0x3f93(%rip),%rsi # 7e09 3e76: 4c 89 f7 mov %r14,%rdi 3e79: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3e7f: e8 bc 6b 00 00 callq aa40 3e84: bf 01 00 00 00 mov $0x1,%edi 3e89: b8 01 00 00 00 mov $0x1,%eax 3e8e: 48 8d 35 63 80 00 00 lea 0x8063(%rip),%rsi # bef8 <_IO_stdin_used+0xef8> 3e95: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3e99: e8 e2 d1 ff ff callq 1080 <__printf_chk@plt> 3e9e: 48 8d 35 a6 3c 00 00 lea 0x3ca6(%rip),%rsi # 7b4b 3ea5: 4c 89 f7 mov %r14,%rdi 3ea8: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3eae: e8 8d 6b 00 00 callq aa40 3eb3: bf 01 00 00 00 mov $0x1,%edi 3eb8: b8 01 00 00 00 mov $0x1,%eax 3ebd: 48 8d 35 04 80 00 00 lea 0x8004(%rip),%rsi # bec8 <_IO_stdin_used+0xec8> 3ec4: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3ec8: e8 b3 d1 ff ff callq 1080 <__printf_chk@plt> 3ecd: 48 8d 35 cc 3d 00 00 lea 0x3dcc(%rip),%rsi # 7ca0 3ed4: 4c 89 f7 mov %r14,%rdi 3ed7: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3edd: e8 5e 6b 00 00 callq aa40 3ee2: bf 01 00 00 00 mov $0x1,%edi 3ee7: b8 01 00 00 00 mov $0x1,%eax 3eec: 48 8d 35 a5 7f 00 00 lea 0x7fa5(%rip),%rsi # be98 <_IO_stdin_used+0xe98> 3ef3: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 3ef7: e8 84 d1 ff ff callq 1080 <__printf_chk@plt> 3efc: 48 8d 35 ad 3f 00 00 lea 0x3fad(%rip),%rsi # 7eb0 3f03: 4c 89 f7 mov %r14,%rdi 3f06: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3f0c: e8 2f 6b 00 00 callq aa40 3f11: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 3f17: bf 01 00 00 00 mov $0x1,%edi 3f1c: 48 8d 35 45 7f 00 00 lea 0x7f45(%rip),%rsi # be68 <_IO_stdin_used+0xe68> 3f23: b8 01 00 00 00 mov $0x1,%eax 3f28: f3 0f 5e e8 divss %xmm0,%xmm5 3f2c: 66 0f ef c0 pxor %xmm0,%xmm0 3f30: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 3f34: e8 47 d1 ff ff callq 1080 <__printf_chk@plt> 3f39: 48 8d 35 5b 43 00 00 lea 0x435b(%rip),%rsi # 829b 3f40: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3f46: bf 00 
2f 68 59 mov $0x59682f00,%edi 3f4b: e8 f0 6a 00 00 callq aa40 3f50: f3 0f 10 74 24 0c movss 0xc(%rsp),%xmm6 3f56: bf 01 00 00 00 mov $0x1,%edi 3f5b: 48 8d 35 ce 7e 00 00 lea 0x7ece(%rip),%rsi # be30 <_IO_stdin_used+0xe30> 3f62: b8 01 00 00 00 mov $0x1,%eax 3f67: f3 0f 5e f0 divss %xmm0,%xmm6 3f6b: 66 0f ef c0 pxor %xmm0,%xmm0 3f6f: f3 0f 5a c6 cvtss2sd %xmm6,%xmm0 3f73: e8 08 d1 ff ff callq 1080 <__printf_chk@plt> 3f78: 48 8d 35 ca 43 00 00 lea 0x43ca(%rip),%rsi # 8349 3f7f: 4c 89 f7 mov %r14,%rdi 3f82: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3f88: e8 b3 6a 00 00 callq aa40 3f8d: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 3f93: bf 01 00 00 00 mov $0x1,%edi 3f98: 48 8d 35 61 7e 00 00 lea 0x7e61(%rip),%rsi # be00 <_IO_stdin_used+0xe00> 3f9f: b8 01 00 00 00 mov $0x1,%eax 3fa4: f3 0f 5e f8 divss %xmm0,%xmm7 3fa8: 66 0f ef c0 pxor %xmm0,%xmm0 3fac: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 3fb0: e8 cb d0 ff ff callq 1080 <__printf_chk@plt> 3fb5: 48 8d 35 e0 48 00 00 lea 0x48e0(%rip),%rsi # 889c 3fbc: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 3fc2: bf 00 2f 68 59 mov $0x59682f00,%edi 3fc7: e8 74 6a 00 00 callq aa40 3fcc: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 3fd2: bf 01 00 00 00 mov $0x1,%edi 3fd7: 48 8d 35 ea 7d 00 00 lea 0x7dea(%rip),%rsi # bdc8 <_IO_stdin_used+0xdc8> 3fde: b8 01 00 00 00 mov $0x1,%eax 3fe3: f3 0f 5e e8 divss %xmm0,%xmm5 3fe7: 66 0f ef c0 pxor %xmm0,%xmm0 3feb: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 3fef: e8 8c d0 ff ff callq 1080 <__printf_chk@plt> 3ff4: 48 8d 35 c6 49 00 00 lea 0x49c6(%rip),%rsi # 89c1 3ffb: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4001: bf 00 2f 68 59 mov $0x59682f00,%edi 4006: e8 35 6a 00 00 callq aa40 400b: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 4011: bf 01 00 00 00 mov $0x1,%edi 4016: 48 8d 35 83 7d 00 00 lea 0x7d83(%rip),%rsi # bda0 <_IO_stdin_used+0xda0> 401d: b8 01 00 00 00 mov $0x1,%eax 4022: f3 0f 5e f8 divss %xmm0,%xmm7 4026: 66 0f ef c0 pxor %xmm0,%xmm0 402a: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 402e: e8 4d d0 ff ff callq 1080 
<__printf_chk@plt> 4033: 48 8d 35 41 4b 00 00 lea 0x4b41(%rip),%rsi # 8b7b 403a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4040: bf 00 2f 68 59 mov $0x59682f00,%edi 4045: e8 f6 69 00 00 callq aa40 404a: f3 0f 10 74 24 0c movss 0xc(%rsp),%xmm6 4050: bf 01 00 00 00 mov $0x1,%edi 4055: 48 8d 35 1c 7d 00 00 lea 0x7d1c(%rip),%rsi # bd78 <_IO_stdin_used+0xd78> 405c: b8 01 00 00 00 mov $0x1,%eax 4061: f3 0f 5e f0 divss %xmm0,%xmm6 4065: 66 0f ef c0 pxor %xmm0,%xmm0 4069: f3 0f 5a c6 cvtss2sd %xmm6,%xmm0 406d: e8 0e d0 ff ff callq 1080 <__printf_chk@plt> 4072: 48 8d 35 13 58 00 00 lea 0x5813(%rip),%rsi # 988c 4079: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 407f: bf 00 2f 68 59 mov $0x59682f00,%edi 4084: e8 b7 69 00 00 callq aa40 4089: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 408f: bf 01 00 00 00 mov $0x1,%edi 4094: 48 8d 35 b5 7c 00 00 lea 0x7cb5(%rip),%rsi # bd50 <_IO_stdin_used+0xd50> 409b: b8 01 00 00 00 mov $0x1,%eax 40a0: f3 0f 5e e8 divss %xmm0,%xmm5 40a4: 66 0f ef c0 pxor %xmm0,%xmm0 40a8: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 40ac: e8 cf cf ff ff callq 1080 <__printf_chk@plt> 40b1: 48 8d 35 34 58 00 00 lea 0x5834(%rip),%rsi # 98ec 40b8: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 40be: bf 00 2f 68 59 mov $0x59682f00,%edi 40c3: e8 78 69 00 00 callq aa40 40c8: f3 0f 10 74 24 0c movss 0xc(%rsp),%xmm6 40ce: bf 01 00 00 00 mov $0x1,%edi 40d3: 48 8d 35 4e 7c 00 00 lea 0x7c4e(%rip),%rsi # bd28 <_IO_stdin_used+0xd28> 40da: b8 01 00 00 00 mov $0x1,%eax 40df: f3 0f 5e f0 divss %xmm0,%xmm6 40e3: 66 0f ef c0 pxor %xmm0,%xmm0 40e7: f3 0f 5a c6 cvtss2sd %xmm6,%xmm0 40eb: e8 90 cf ff ff callq 1080 <__printf_chk@plt> 40f0: 48 8d 35 c4 58 00 00 lea 0x58c4(%rip),%rsi # 99bb 40f7: 4c 89 f7 mov %r14,%rdi 40fa: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4100: e8 3b 69 00 00 callq aa40 4105: bf 01 00 00 00 mov $0x1,%edi 410a: b8 01 00 00 00 mov $0x1,%eax 410f: 48 8d 35 f2 7b 00 00 lea 0x7bf2(%rip),%rsi # bd08 <_IO_stdin_used+0xd08> 4116: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 411a: e8 61 cf ff ff callq 1080 
<__printf_chk@plt> 411f: 48 8d 35 26 58 00 00 lea 0x5826(%rip),%rsi # 994c 4126: 4c 89 f7 mov %r14,%rdi 4129: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 412f: e8 0c 69 00 00 callq aa40 4134: bf 01 00 00 00 mov $0x1,%edi 4139: b8 01 00 00 00 mov $0x1,%eax 413e: 48 8d 35 a3 7b 00 00 lea 0x7ba3(%rip),%rsi # bce8 <_IO_stdin_used+0xce8> 4145: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4149: e8 32 cf ff ff callq 1080 <__printf_chk@plt> 414e: 48 8d 35 5f 42 00 00 lea 0x425f(%rip),%rsi # 83b4 4155: 4c 89 f7 mov %r14,%rdi 4158: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 415e: e8 dd 68 00 00 callq aa40 4163: bf 01 00 00 00 mov $0x1,%edi 4168: b8 01 00 00 00 mov $0x1,%eax 416d: 48 8d 35 4c 7b 00 00 lea 0x7b4c(%rip),%rsi # bcc0 <_IO_stdin_used+0xcc0> 4174: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4178: e8 03 cf ff ff callq 1080 <__printf_chk@plt> 417d: 48 8d 35 99 46 00 00 lea 0x4699(%rip),%rsi # 881d 4184: 4c 89 f7 mov %r14,%rdi 4187: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 418d: e8 ae 68 00 00 callq aa40 4192: bf 01 00 00 00 mov $0x1,%edi 4197: b8 01 00 00 00 mov $0x1,%eax 419c: 48 8d 35 f5 7a 00 00 lea 0x7af5(%rip),%rsi # bc98 <_IO_stdin_used+0xc98> 41a3: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 41a7: e8 d4 ce ff ff callq 1080 <__printf_chk@plt> 41ac: 48 8d 35 fc 4b 00 00 lea 0x4bfc(%rip),%rsi # 8daf 41b3: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 41b9: bf 00 2f 68 59 mov $0x59682f00,%edi 41be: e8 7d 68 00 00 callq aa40 41c3: bf 01 00 00 00 mov $0x1,%edi 41c8: b8 01 00 00 00 mov $0x1,%eax 41cd: 48 8d 35 0a 73 00 00 lea 0x730a(%rip),%rsi # b4de <_IO_stdin_used+0x4de> 41d4: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 41d8: e8 a3 ce ff ff callq 1080 <__printf_chk@plt> 41dd: 48 8d 35 83 4c 00 00 lea 0x4c83(%rip),%rsi # 8e67 41e4: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 41ea: bf 00 2f 68 59 mov $0x59682f00,%edi 41ef: e8 4c 68 00 00 callq aa40 41f4: bf 01 00 00 00 mov $0x1,%edi 41f9: b8 01 00 00 00 mov $0x1,%eax 41fe: 48 8d 35 be 72 00 00 lea 0x72be(%rip),%rsi # b4c3 <_IO_stdin_used+0x4c3> 4205: f3 0f 5a c0 cvtss2sd 
%xmm0,%xmm0 4209: e8 72 ce ff ff callq 1080 <__printf_chk@plt> 420e: 48 8d 35 04 55 00 00 lea 0x5504(%rip),%rsi # 9719 4215: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 421b: bf 00 2f 68 59 mov $0x59682f00,%edi 4220: e8 1b 68 00 00 callq aa40 4225: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 422b: bf 01 00 00 00 mov $0x1,%edi 4230: 48 8d 35 39 7a 00 00 lea 0x7a39(%rip),%rsi # bc70 <_IO_stdin_used+0xc70> 4237: b8 01 00 00 00 mov $0x1,%eax 423c: f3 0f 5e f8 divss %xmm0,%xmm7 4240: 66 0f ef c0 pxor %xmm0,%xmm0 4244: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 4248: e8 33 ce ff ff callq 1080 <__printf_chk@plt> 424d: 48 8d 35 7d 55 00 00 lea 0x557d(%rip),%rsi # 97d1 4254: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 425a: bf 00 2f 68 59 mov $0x59682f00,%edi 425f: e8 dc 67 00 00 callq aa40 4264: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 426a: bf 01 00 00 00 mov $0x1,%edi 426f: 48 8d 35 d2 79 00 00 lea 0x79d2(%rip),%rsi # bc48 <_IO_stdin_used+0xc48> 4276: b8 01 00 00 00 mov $0x1,%eax 427b: f3 0f 5e e8 divss %xmm0,%xmm5 427f: 66 0f ef c0 pxor %xmm0,%xmm0 4283: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 4287: e8 f4 cd ff ff callq 1080 <__printf_chk@plt> 428c: 48 8d 35 49 48 00 00 lea 0x4849(%rip),%rsi # 8adc 4293: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4299: bf 00 2f 68 59 mov $0x59682f00,%edi 429e: e8 9d 67 00 00 callq aa40 42a3: bf 01 00 00 00 mov $0x1,%edi 42a8: b8 01 00 00 00 mov $0x1,%eax 42ad: 48 8d 35 f3 71 00 00 lea 0x71f3(%rip),%rsi # b4a7 <_IO_stdin_used+0x4a7> 42b4: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 42b8: e8 c3 cd ff ff callq 1080 <__printf_chk@plt> 42bd: 48 8d 35 79 47 00 00 lea 0x4779(%rip),%rsi # 8a3d 42c4: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 42ca: bf 00 2f 68 59 mov $0x59682f00,%edi 42cf: e8 6c 67 00 00 callq aa40 42d4: bf 01 00 00 00 mov $0x1,%edi 42d9: b8 01 00 00 00 mov $0x1,%eax 42de: 48 8d 35 a6 71 00 00 lea 0x71a6(%rip),%rsi # b48b <_IO_stdin_used+0x48b> 42e5: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 42e9: e8 92 cd ff ff callq 1080 <__printf_chk@plt> 42ee: 48 8d 35 2d 4c 00 00 lea 
0x4c2d(%rip),%rsi # 8f22 42f5: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 42fb: bf 05 7a d7 03 mov $0x3d77a05,%edi 4300: 48 c1 e7 09 shl $0x9,%rdi 4304: e8 37 67 00 00 callq aa40 4309: bf 01 00 00 00 mov $0x1,%edi 430e: b8 01 00 00 00 mov $0x1,%eax 4313: 48 8d 35 06 79 00 00 lea 0x7906(%rip),%rsi # bc20 <_IO_stdin_used+0xc20> 431a: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 431e: e8 5d cd ff ff callq 1080 <__printf_chk@plt> 4323: 48 8d 35 a7 4e 00 00 lea 0x4ea7(%rip),%rsi # 91d1 432a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4330: bf 05 7a d7 03 mov $0x3d77a05,%edi 4335: 48 c1 e7 09 shl $0x9,%rdi 4339: e8 02 67 00 00 callq aa40 433e: bf 01 00 00 00 mov $0x1,%edi 4343: b8 01 00 00 00 mov $0x1,%eax 4348: 48 8d 35 a9 78 00 00 lea 0x78a9(%rip),%rsi # bbf8 <_IO_stdin_used+0xbf8> 434f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4353: e8 28 cd ff ff callq 1080 <__printf_chk@plt> 4358: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 435e: bf 05 7a d7 03 mov $0x3d77a05,%edi 4363: 48 c1 e7 09 shl $0x9,%rdi 4367: 48 8d 35 c2 66 00 00 lea 0x66c2(%rip),%rsi # aa30 436e: e8 cd 66 00 00 callq aa40 4373: bf 01 00 00 00 mov $0x1,%edi 4378: b8 01 00 00 00 mov $0x1,%eax 437d: 48 8d 35 44 78 00 00 lea 0x7844(%rip),%rsi # bbc8 <_IO_stdin_used+0xbc8> 4384: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4388: e8 f3 cc ff ff callq 1080 <__printf_chk@plt> 438d: 48 8d 35 ee 4e 00 00 lea 0x4eee(%rip),%rsi # 9282 4394: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 439a: bf 05 7a d7 03 mov $0x3d77a05,%edi 439f: 48 c1 e7 09 shl $0x9,%rdi 43a3: e8 98 66 00 00 callq aa40 43a8: bf 01 00 00 00 mov $0x1,%edi 43ad: b8 01 00 00 00 mov $0x1,%eax 43b2: 48 8d 35 e7 77 00 00 lea 0x77e7(%rip),%rsi # bba0 <_IO_stdin_used+0xba0> 43b9: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 43bd: e8 be cc ff ff callq 1080 <__printf_chk@plt> 43c2: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 43c8: bf 05 7a d7 03 mov $0x3d77a05,%edi 43cd: 48 c1 e7 09 shl $0x9,%rdi 43d1: 48 8d 35 48 66 00 00 lea 0x6648(%rip),%rsi # aa20 43d8: e8 63 66 00 00 callq aa40 43dd: bf 01 00 00 00 mov $0x1,%edi 
43e2: b8 01 00 00 00 mov $0x1,%eax 43e7: 48 8d 35 5a 7c 00 00 lea 0x7c5a(%rip),%rsi # c048 <_IO_stdin_used+0x1048> 43ee: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 43f2: e8 89 cc ff ff callq 1080 <__printf_chk@plt> 43f7: 48 8d 35 9a 51 00 00 lea 0x519a(%rip),%rsi # 9598 43fe: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4404: bf 05 7a d7 03 mov $0x3d77a05,%edi 4409: 48 c1 e7 09 shl $0x9,%rdi 440d: e8 2e 66 00 00 callq aa40 4412: bf 01 00 00 00 mov $0x1,%edi 4417: b8 01 00 00 00 mov $0x1,%eax 441c: 48 8d 35 4d 77 00 00 lea 0x774d(%rip),%rsi # bb70 <_IO_stdin_used+0xb70> 4423: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4427: e8 54 cc ff ff callq 1080 <__printf_chk@plt> 442c: 48 8d 35 c1 37 00 00 lea 0x37c1(%rip),%rsi # 7bf4 4433: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4439: bf 00 2f 68 59 mov $0x59682f00,%edi 443e: e8 fd 65 00 00 callq aa40 4443: bf 01 00 00 00 mov $0x1,%edi 4448: b8 01 00 00 00 mov $0x1,%eax 444d: 48 8d 35 f4 76 00 00 lea 0x76f4(%rip),%rsi # bb48 <_IO_stdin_used+0xb48> 4454: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4458: e8 23 cc ff ff callq 1080 <__printf_chk@plt> 445d: 48 8d 35 66 56 00 00 lea 0x5666(%rip),%rsi # 9aca 4464: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 446a: bf 00 2f 68 59 mov $0x59682f00,%edi 446f: e8 cc 65 00 00 callq aa40 4474: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 447a: bf 01 00 00 00 mov $0x1,%edi 447f: 48 8d 35 9a 76 00 00 lea 0x769a(%rip),%rsi # bb20 <_IO_stdin_used+0xb20> 4486: b8 01 00 00 00 mov $0x1,%eax 448b: f3 0f 5e f8 divss %xmm0,%xmm7 448f: 66 0f ef c0 pxor %xmm0,%xmm0 4493: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 4497: e8 e4 cb ff ff callq 1080 <__printf_chk@plt> 449c: 48 8d 35 87 55 00 00 lea 0x5587(%rip),%rsi # 9a2a 44a3: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 44a9: bf 00 2f 68 59 mov $0x59682f00,%edi 44ae: e8 8d 65 00 00 callq aa40 44b3: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 44b9: bf 01 00 00 00 mov $0x1,%edi 44be: 48 8d 35 33 76 00 00 lea 0x7633(%rip),%rsi # baf8 <_IO_stdin_used+0xaf8> 44c5: b8 01 00 00 00 mov $0x1,%eax 44ca: f3 0f 5e e8 divss 
%xmm0,%xmm5 44ce: 66 0f ef c0 pxor %xmm0,%xmm0 44d2: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 44d6: e8 a5 cb ff ff callq 1080 <__printf_chk@plt> 44db: 48 8d 35 9c 56 00 00 lea 0x569c(%rip),%rsi # 9b7e 44e2: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 44e8: bf 00 2f 68 59 mov $0x59682f00,%edi 44ed: e8 4e 65 00 00 callq aa40 44f2: bf 01 00 00 00 mov $0x1,%edi 44f7: b8 01 00 00 00 mov $0x1,%eax 44fc: 48 8d 35 6d 6f 00 00 lea 0x6f6d(%rip),%rsi # b470 <_IO_stdin_used+0x470> 4503: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4507: e8 74 cb ff ff callq 1080 <__printf_chk@plt> 450c: 48 8d 35 1f 57 00 00 lea 0x571f(%rip),%rsi # 9c32 4513: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4519: bf 00 2f 68 59 mov $0x59682f00,%edi 451e: e8 1d 65 00 00 callq aa40 4523: bf 01 00 00 00 mov $0x1,%edi 4528: b8 01 00 00 00 mov $0x1,%eax 452d: 48 8d 35 21 6f 00 00 lea 0x6f21(%rip),%rsi # b455 <_IO_stdin_used+0x455> 4534: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4538: e8 43 cb ff ff callq 1080 <__printf_chk@plt> 453d: 48 8d 35 d0 57 00 00 lea 0x57d0(%rip),%rsi # 9d14 4544: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 454a: bf 00 2f 68 59 mov $0x59682f00,%edi 454f: e8 ec 64 00 00 callq aa40 4554: bf 01 00 00 00 mov $0x1,%edi 4559: b8 01 00 00 00 mov $0x1,%eax 455e: 48 8d 35 63 75 00 00 lea 0x7563(%rip),%rsi # bac8 <_IO_stdin_used+0xac8> 4565: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4569: e8 12 cb ff ff callq 1080 <__printf_chk@plt> 456e: 48 8d 35 4b 58 00 00 lea 0x584b(%rip),%rsi # 9dc0 4575: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 457b: bf 00 2f 68 59 mov $0x59682f00,%edi 4580: e8 bb 64 00 00 callq aa40 4585: bf 01 00 00 00 mov $0x1,%edi 458a: b8 01 00 00 00 mov $0x1,%eax 458f: 48 8d 35 02 75 00 00 lea 0x7502(%rip),%rsi # ba98 <_IO_stdin_used+0xa98> 4596: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 459a: e8 e1 ca ff ff callq 1080 <__printf_chk@plt> 459f: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 45a5: 48 8d 35 c4 63 00 00 lea 0x63c4(%rip),%rsi # a970 45ac: bf 00 2f 68 59 mov $0x59682f00,%edi 45b1: e8 8a 64 00 00 callq aa40 45b6: bf 01 00 00 00 mov 
$0x1,%edi 45bb: b8 01 00 00 00 mov $0x1,%eax 45c0: 48 8d 35 71 6e 00 00 lea 0x6e71(%rip),%rsi # b438 <_IO_stdin_used+0x438> 45c7: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 45cb: e8 b0 ca ff ff callq 1080 <__printf_chk@plt> 45d0: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 45d6: 48 8d 35 a3 63 00 00 lea 0x63a3(%rip),%rsi # a980 45dd: bf 00 2f 68 59 mov $0x59682f00,%edi 45e2: e8 59 64 00 00 callq aa40 45e7: bf 01 00 00 00 mov $0x1,%edi 45ec: b8 01 00 00 00 mov $0x1,%eax 45f1: 48 8d 35 78 74 00 00 lea 0x7478(%rip),%rsi # ba70 <_IO_stdin_used+0xa70> 45f8: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 45fc: e8 7f ca ff ff callq 1080 <__printf_chk@plt> 4601: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4607: 48 8d 35 92 63 00 00 lea 0x6392(%rip),%rsi # a9a0 460e: bf 00 2f 68 59 mov $0x59682f00,%edi 4613: e8 28 64 00 00 callq aa40 4618: bf 01 00 00 00 mov $0x1,%edi 461d: b8 01 00 00 00 mov $0x1,%eax 4622: 48 8d 35 f2 6d 00 00 lea 0x6df2(%rip),%rsi # b41b <_IO_stdin_used+0x41b> 4629: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 462d: e8 4e ca ff ff callq 1080 <__printf_chk@plt> 4632: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4638: 48 8d 35 51 63 00 00 lea 0x6351(%rip),%rsi # a990 463f: bf 00 2f 68 59 mov $0x59682f00,%edi 4644: e8 f7 63 00 00 callq aa40 4649: bf 01 00 00 00 mov $0x1,%edi 464e: b8 01 00 00 00 mov $0x1,%eax 4653: 48 8d 35 ee 73 00 00 lea 0x73ee(%rip),%rsi # ba48 <_IO_stdin_used+0xa48> 465a: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 465e: e8 1d ca ff ff callq 1080 <__printf_chk@plt> 4663: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4669: 48 8d 35 50 63 00 00 lea 0x6350(%rip),%rsi # a9c0 4670: bf 00 2f 68 59 mov $0x59682f00,%edi 4675: e8 c6 63 00 00 callq aa40 467a: bf 01 00 00 00 mov $0x1,%edi 467f: b8 01 00 00 00 mov $0x1,%eax 4684: 48 8d 35 72 6d 00 00 lea 0x6d72(%rip),%rsi # b3fd <_IO_stdin_used+0x3fd> 468b: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 468f: e8 ec c9 ff ff callq 1080 <__printf_chk@plt> 4694: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 469a: 48 8d 35 3f 63 00 00 lea 0x633f(%rip),%rsi # a9e0 46a1: bf 00 2f 68 59 
mov $0x59682f00,%edi 46a6: e8 95 63 00 00 callq aa40 46ab: bf 01 00 00 00 mov $0x1,%edi 46b0: b8 01 00 00 00 mov $0x1,%eax 46b5: 48 8d 35 23 6d 00 00 lea 0x6d23(%rip),%rsi # b3df <_IO_stdin_used+0x3df> 46bc: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 46c0: e8 bb c9 ff ff callq 1080 <__printf_chk@plt> 46c5: e9 4b e2 ff ff jmpq 2915 46ca: 48 8b 0d 0f 9a 00 00 mov 0x9a0f(%rip),%rcx # e0e0 46d1: ba 0f 00 00 00 mov $0xf,%edx 46d6: be 01 00 00 00 mov $0x1,%esi 46db: 48 8d 3d 59 69 00 00 lea 0x6959(%rip),%rdi # b03b <_IO_stdin_used+0x3b> 46e2: e8 a9 c9 ff ff callq 1090 46e7: e9 91 ca ff ff jmpq 117d 46ec: 48 8b 0d ed 99 00 00 mov 0x99ed(%rip),%rcx # e0e0 46f3: ba 0f 00 00 00 mov $0xf,%edx 46f8: be 01 00 00 00 mov $0x1,%esi 46fd: 48 8d 3d 27 69 00 00 lea 0x6927(%rip),%rdi # b02b <_IO_stdin_used+0x2b> 4704: e8 87 c9 ff ff callq 1090 4709: e9 64 ca ff ff jmpq 1172 470e: 48 8b 0d cb 99 00 00 mov 0x99cb(%rip),%rcx # e0e0 4715: ba 0e 00 00 00 mov $0xe,%edx 471a: be 01 00 00 00 mov $0x1,%esi 471f: 48 8d 3d f6 68 00 00 lea 0x68f6(%rip),%rdi # b01c <_IO_stdin_used+0x1c> 4726: e8 65 c9 ff ff callq 1090 472b: e9 37 ca ff ff jmpq 1167 4730: 48 8b 7d 10 mov 0x10(%rbp),%rdi 4734: 31 f6 xor %esi,%esi 4736: ba 0a 00 00 00 mov $0xa,%edx 473b: e8 30 c9 ff ff callq 1070 4740: 48 8d 35 bd 68 00 00 lea 0x68bd(%rip),%rsi # b004 <_IO_stdin_used+0x4> 4747: bf 01 00 00 00 mov $0x1,%edi 474c: 4c 69 f0 00 2f 68 59 imul $0x59682f00,%rax,%r14 4753: 31 c0 xor %eax,%eax 4755: 4c 89 f2 mov %r14,%rdx 4758: e8 23 c9 ff ff callq 1080 <__printf_chk@plt> 475d: e9 f3 c9 ff ff jmpq 1155 4762: 48 8d 35 cf 4e 00 00 lea 0x4ecf(%rip),%rsi # 9638 4769: f3 0f 10 05 17 79 00 movss 0x7917(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 4770: 00 4771: bf 00 2f 68 59 mov $0x59682f00,%edi 4776: e8 c5 62 00 00 callq aa40 477b: bf 01 00 00 00 mov $0x1,%edi 4780: b8 01 00 00 00 mov $0x1,%eax 4785: f3 0f 10 2d fb 78 00 movss 0x78fb(%rip),%xmm5 # c088 <_IO_stdin_used+0x1088> 478c: 00 478d: 48 8d 35 0c 70 00 00 lea 0x700c(%rip),%rsi # b7a0 
<_IO_stdin_used+0x7a0> 4794: f3 0f 5e e8 divss %xmm0,%xmm5 4798: 66 0f ef c0 pxor %xmm0,%xmm0 479c: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 47a0: e8 db c8 ff ff callq 1080 <__printf_chk@plt> 47a5: 41 83 fc 01 cmp $0x1,%r12d 47a9: 0f 8e 97 e1 ff ff jle 2946 47af: 4c 8b 6d 08 mov 0x8(%rbp),%r13 47b3: ba 0f 00 00 00 mov $0xf,%edx 47b8: 48 8d 35 48 69 00 00 lea 0x6948(%rip),%rsi # b107 <_IO_stdin_used+0x107> 47bf: 4c 89 ef mov %r13,%rdi 47c2: e8 69 c8 ff ff callq 1030 47c7: 85 c0 test %eax,%eax 47c9: 0f 85 4d 1c 00 00 jne 641c 47cf: 48 8d 35 02 45 00 00 lea 0x4502(%rip),%rsi # 8cd8 47d6: f3 0f 10 05 aa 78 00 movss 0x78aa(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 47dd: 00 47de: bf 00 2f 68 59 mov $0x59682f00,%edi 47e3: e8 58 62 00 00 callq aa40 47e8: bf 01 00 00 00 mov $0x1,%edi 47ed: b8 01 00 00 00 mov $0x1,%eax 47f2: 48 8d 35 cf 6f 00 00 lea 0x6fcf(%rip),%rsi # b7c8 <_IO_stdin_used+0x7c8> 47f9: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 47fd: e8 7e c8 ff ff callq 1080 <__printf_chk@plt> 4802: e9 51 ea ff ff jmpq 3258 4807: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 480d: 48 8d 35 7c 61 00 00 lea 0x617c(%rip),%rsi # a990 4814: bf 00 2f 68 59 mov $0x59682f00,%edi 4819: e8 22 62 00 00 callq aa40 481e: bf 01 00 00 00 mov $0x1,%edi 4823: b8 01 00 00 00 mov $0x1,%eax 4828: 48 8d 35 19 72 00 00 lea 0x7219(%rip),%rsi # ba48 <_IO_stdin_used+0xa48> 482f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4833: e8 48 c8 ff ff callq 1080 <__printf_chk@plt> 4838: 41 83 fc 01 cmp $0x1,%r12d 483c: 0f 8f 17 e0 ff ff jg 2859 4842: e9 ff e0 ff ff jmpq 2946 4847: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 484d: 48 8d 35 4c 61 00 00 lea 0x614c(%rip),%rsi # a9a0 4854: bf 00 2f 68 59 mov $0x59682f00,%edi 4859: e8 e2 61 00 00 callq aa40 485e: bf 01 00 00 00 mov $0x1,%edi 4863: b8 01 00 00 00 mov $0x1,%eax 4868: 48 8d 35 ac 6b 00 00 lea 0x6bac(%rip),%rsi # b41b <_IO_stdin_used+0x41b> 486f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4873: e8 08 c8 ff ff callq 1080 <__printf_chk@plt> 4878: 41 83 fc 01 cmp $0x1,%r12d 487c: 0f 8f 86 df ff ff jg 
2808 4882: e9 bf e0 ff ff jmpq 2946 4887: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 488d: 48 8d 35 ec 60 00 00 lea 0x60ec(%rip),%rsi # a980 4894: bf 00 2f 68 59 mov $0x59682f00,%edi 4899: e8 a2 61 00 00 callq aa40 489e: bf 01 00 00 00 mov $0x1,%edi 48a3: b8 01 00 00 00 mov $0x1,%eax 48a8: 48 8d 35 c1 71 00 00 lea 0x71c1(%rip),%rsi # ba70 <_IO_stdin_used+0xa70> 48af: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 48b3: e8 c8 c7 ff ff callq 1080 <__printf_chk@plt> 48b8: 41 83 fc 01 cmp $0x1,%r12d 48bc: 0f 8f f5 de ff ff jg 27b7 48c2: e9 7f e0 ff ff jmpq 2946 48c7: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 48cd: 48 8d 35 9c 60 00 00 lea 0x609c(%rip),%rsi # a970 48d4: bf 00 2f 68 59 mov $0x59682f00,%edi 48d9: e8 62 61 00 00 callq aa40 48de: bf 01 00 00 00 mov $0x1,%edi 48e3: b8 01 00 00 00 mov $0x1,%eax 48e8: 48 8d 35 49 6b 00 00 lea 0x6b49(%rip),%rsi # b438 <_IO_stdin_used+0x438> 48ef: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 48f3: e8 88 c7 ff ff callq 1080 <__printf_chk@plt> 48f8: 41 83 fc 01 cmp $0x1,%r12d 48fc: 0f 8f 64 de ff ff jg 2766 4902: e9 3f e0 ff ff jmpq 2946 4907: 48 8d 35 b2 54 00 00 lea 0x54b2(%rip),%rsi # 9dc0 490e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4914: bf 00 2f 68 59 mov $0x59682f00,%edi 4919: e8 22 61 00 00 callq aa40 491e: bf 01 00 00 00 mov $0x1,%edi 4923: b8 01 00 00 00 mov $0x1,%eax 4928: 48 8d 35 69 71 00 00 lea 0x7169(%rip),%rsi # ba98 <_IO_stdin_used+0xa98> 492f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4933: e8 48 c7 ff ff callq 1080 <__printf_chk@plt> 4938: 41 83 fc 01 cmp $0x1,%r12d 493c: 0f 8f d3 dd ff ff jg 2715 4942: e9 ff df ff ff jmpq 2946 4947: 48 8d 35 c6 53 00 00 lea 0x53c6(%rip),%rsi # 9d14 494e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4954: bf 00 2f 68 59 mov $0x59682f00,%edi 4959: e8 e2 60 00 00 callq aa40 495e: bf 01 00 00 00 mov $0x1,%edi 4963: b8 01 00 00 00 mov $0x1,%eax 4968: 48 8d 35 59 71 00 00 lea 0x7159(%rip),%rsi # bac8 <_IO_stdin_used+0xac8> 496f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4973: e8 08 c7 ff ff callq 1080 <__printf_chk@plt> 4978: 41 83 fc 
01 cmp $0x1,%r12d 497c: 0f 8f 42 dd ff ff jg 26c4 4982: e9 bf df ff ff jmpq 2946 4987: 48 8d 35 a4 52 00 00 lea 0x52a4(%rip),%rsi # 9c32 498e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4994: bf 00 2f 68 59 mov $0x59682f00,%edi 4999: e8 a2 60 00 00 callq aa40 499e: bf 01 00 00 00 mov $0x1,%edi 49a3: b8 01 00 00 00 mov $0x1,%eax 49a8: 48 8d 35 a6 6a 00 00 lea 0x6aa6(%rip),%rsi # b455 <_IO_stdin_used+0x455> 49af: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 49b3: e8 c8 c6 ff ff callq 1080 <__printf_chk@plt> 49b8: 41 83 fc 01 cmp $0x1,%r12d 49bc: 0f 8f b1 dc ff ff jg 2673 49c2: e9 7f df ff ff jmpq 2946 49c7: 48 8d 35 b0 51 00 00 lea 0x51b0(%rip),%rsi # 9b7e 49ce: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 49d4: bf 00 2f 68 59 mov $0x59682f00,%edi 49d9: e8 62 60 00 00 callq aa40 49de: bf 01 00 00 00 mov $0x1,%edi 49e3: b8 01 00 00 00 mov $0x1,%eax 49e8: 48 8d 35 81 6a 00 00 lea 0x6a81(%rip),%rsi # b470 <_IO_stdin_used+0x470> 49ef: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 49f3: e8 88 c6 ff ff callq 1080 <__printf_chk@plt> 49f8: 41 83 fc 01 cmp $0x1,%r12d 49fc: 0f 8f 20 dc ff ff jg 2622 4a02: e9 3f df ff ff jmpq 2946 4a07: 48 8d 35 1c 50 00 00 lea 0x501c(%rip),%rsi # 9a2a 4a0e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4a14: bf 00 2f 68 59 mov $0x59682f00,%edi 4a19: e8 22 60 00 00 callq aa40 4a1e: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 4a24: bf 01 00 00 00 mov $0x1,%edi 4a29: 48 8d 35 c8 70 00 00 lea 0x70c8(%rip),%rsi # baf8 <_IO_stdin_used+0xaf8> 4a30: b8 01 00 00 00 mov $0x1,%eax 4a35: f3 0f 5e e8 divss %xmm0,%xmm5 4a39: 66 0f ef c0 pxor %xmm0,%xmm0 4a3d: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 4a41: e8 3a c6 ff ff callq 1080 <__printf_chk@plt> 4a46: 41 83 fc 01 cmp $0x1,%r12d 4a4a: 0f 8f 81 db ff ff jg 25d1 4a50: e9 f1 de ff ff jmpq 2946 4a55: 48 8d 35 6e 50 00 00 lea 0x506e(%rip),%rsi # 9aca 4a5c: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4a62: bf 00 2f 68 59 mov $0x59682f00,%edi 4a67: e8 d4 5f 00 00 callq aa40 4a6c: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 4a72: bf 01 00 00 00 mov $0x1,%edi 4a77: 48 8d 
35 a2 70 00 00 lea 0x70a2(%rip),%rsi # bb20 <_IO_stdin_used+0xb20> 4a7e: b8 01 00 00 00 mov $0x1,%eax 4a83: f3 0f 5e e8 divss %xmm0,%xmm5 4a87: 66 0f ef c0 pxor %xmm0,%xmm0 4a8b: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 4a8f: e8 ec c5 ff ff callq 1080 <__printf_chk@plt> 4a94: 41 83 fc 01 cmp $0x1,%r12d 4a98: 0f 8f d4 da ff ff jg 2572 4a9e: e9 a3 de ff ff jmpq 2946 4aa3: 48 8d 35 4a 31 00 00 lea 0x314a(%rip),%rsi # 7bf4 4aaa: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4ab0: bf 00 2f 68 59 mov $0x59682f00,%edi 4ab5: e8 86 5f 00 00 callq aa40 4aba: bf 01 00 00 00 mov $0x1,%edi 4abf: b8 01 00 00 00 mov $0x1,%eax 4ac4: 48 8d 35 7d 70 00 00 lea 0x707d(%rip),%rsi # bb48 <_IO_stdin_used+0xb48> 4acb: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4acf: e8 ac c5 ff ff callq 1080 <__printf_chk@plt> 4ad4: 41 83 fc 01 cmp $0x1,%r12d 4ad8: 0f 8f 35 da ff ff jg 2513 4ade: e9 63 de ff ff jmpq 2946 4ae3: 48 8d 35 ae 4a 00 00 lea 0x4aae(%rip),%rsi # 9598 4aea: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4af0: bf 05 7a d7 03 mov $0x3d77a05,%edi 4af5: 48 c1 e7 09 shl $0x9,%rdi 4af9: e8 42 5f 00 00 callq aa40 4afe: bf 01 00 00 00 mov $0x1,%edi 4b03: b8 01 00 00 00 mov $0x1,%eax 4b08: 48 8d 35 61 70 00 00 lea 0x7061(%rip),%rsi # bb70 <_IO_stdin_used+0xb70> 4b0f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4b13: e8 68 c5 ff ff callq 1080 <__printf_chk@plt> 4b18: 41 83 fc 01 cmp $0x1,%r12d 4b1c: 0f 8f a0 d9 ff ff jg 24c2 4b22: e9 1f de ff ff jmpq 2946 4b27: 48 8d 35 54 47 00 00 lea 0x4754(%rip),%rsi # 9282 4b2e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4b34: bf 05 7a d7 03 mov $0x3d77a05,%edi 4b39: 48 c1 e7 09 shl $0x9,%rdi 4b3d: e8 fe 5e 00 00 callq aa40 4b42: bf 01 00 00 00 mov $0x1,%edi 4b47: b8 01 00 00 00 mov $0x1,%eax 4b4c: 48 8d 35 4d 70 00 00 lea 0x704d(%rip),%rsi # bba0 <_IO_stdin_used+0xba0> 4b53: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4b57: e8 24 c5 ff ff callq 1080 <__printf_chk@plt> 4b5c: 41 83 fc 01 cmp $0x1,%r12d 4b60: 0f 8f b2 d8 ff ff jg 2418 4b66: e9 db dd ff ff jmpq 2946 4b6b: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 
4b71: bf 05 7a d7 03 mov $0x3d77a05,%edi 4b76: 48 8d 35 b3 5e 00 00 lea 0x5eb3(%rip),%rsi # aa30 4b7d: 48 c1 e7 09 shl $0x9,%rdi 4b81: e8 ba 5e 00 00 callq aa40 4b86: bf 01 00 00 00 mov $0x1,%edi 4b8b: b8 01 00 00 00 mov $0x1,%eax 4b90: 48 8d 35 31 70 00 00 lea 0x7031(%rip),%rsi # bbc8 <_IO_stdin_used+0xbc8> 4b97: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4b9b: e8 e0 c4 ff ff callq 1080 <__printf_chk@plt> 4ba0: 41 83 fc 01 cmp $0x1,%r12d 4ba4: 0f 8f 1d d8 ff ff jg 23c7 4baa: e9 97 dd ff ff jmpq 2946 4baf: 48 8d 35 1b 46 00 00 lea 0x461b(%rip),%rsi # 91d1 4bb6: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4bbc: bf 05 7a d7 03 mov $0x3d77a05,%edi 4bc1: 48 c1 e7 09 shl $0x9,%rdi 4bc5: e8 76 5e 00 00 callq aa40 4bca: bf 01 00 00 00 mov $0x1,%edi 4bcf: b8 01 00 00 00 mov $0x1,%eax 4bd4: 48 8d 35 1d 70 00 00 lea 0x701d(%rip),%rsi # bbf8 <_IO_stdin_used+0xbf8> 4bdb: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4bdf: e8 9c c4 ff ff callq 1080 <__printf_chk@plt> 4be4: 41 83 fc 01 cmp $0x1,%r12d 4be8: 0f 8f 84 d7 ff ff jg 2372 4bee: e9 53 dd ff ff jmpq 2946 4bf3: 48 8d 35 28 43 00 00 lea 0x4328(%rip),%rsi # 8f22 4bfa: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4c00: bf 05 7a d7 03 mov $0x3d77a05,%edi 4c05: 48 c1 e7 09 shl $0x9,%rdi 4c09: e8 32 5e 00 00 callq aa40 4c0e: bf 01 00 00 00 mov $0x1,%edi 4c13: b8 01 00 00 00 mov $0x1,%eax 4c18: 48 8d 35 01 70 00 00 lea 0x7001(%rip),%rsi # bc20 <_IO_stdin_used+0xc20> 4c1f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4c23: e8 58 c4 ff ff callq 1080 <__printf_chk@plt> 4c28: 41 83 fc 01 cmp $0x1,%r12d 4c2c: 0f 8f eb d6 ff ff jg 231d 4c32: e9 0f dd ff ff jmpq 2946 4c37: 48 8d 35 ff 3d 00 00 lea 0x3dff(%rip),%rsi # 8a3d 4c3e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4c44: bf 00 2f 68 59 mov $0x59682f00,%edi 4c49: e8 f2 5d 00 00 callq aa40 4c4e: bf 01 00 00 00 mov $0x1,%edi 4c53: b8 01 00 00 00 mov $0x1,%eax 4c58: 48 8d 35 2c 68 00 00 lea 0x682c(%rip),%rsi # b48b <_IO_stdin_used+0x48b> 4c5f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4c63: e8 18 c4 ff ff callq 1080 <__printf_chk@plt> 
4c68: 41 83 fc 01 cmp $0x1,%r12d 4c6c: 0f 8f 56 d6 ff ff jg 22c8 4c72: e9 cf dc ff ff jmpq 2946 4c77: 48 8d 35 5e 3e 00 00 lea 0x3e5e(%rip),%rsi # 8adc 4c7e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4c84: bf 00 2f 68 59 mov $0x59682f00,%edi 4c89: e8 b2 5d 00 00 callq aa40 4c8e: bf 01 00 00 00 mov $0x1,%edi 4c93: b8 01 00 00 00 mov $0x1,%eax 4c98: 48 8d 35 08 68 00 00 lea 0x6808(%rip),%rsi # b4a7 <_IO_stdin_used+0x4a7> 4c9f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4ca3: e8 d8 c3 ff ff callq 1080 <__printf_chk@plt> 4ca8: 41 83 fc 01 cmp $0x1,%r12d 4cac: 0f 8f c5 d5 ff ff jg 2277 4cb2: e9 8f dc ff ff jmpq 2946 4cb7: 48 8d 35 13 4b 00 00 lea 0x4b13(%rip),%rsi # 97d1 4cbe: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4cc4: bf 00 2f 68 59 mov $0x59682f00,%edi 4cc9: e8 72 5d 00 00 callq aa40 4cce: f3 0f 10 74 24 0c movss 0xc(%rsp),%xmm6 4cd4: bf 01 00 00 00 mov $0x1,%edi 4cd9: 48 8d 35 68 6f 00 00 lea 0x6f68(%rip),%rsi # bc48 <_IO_stdin_used+0xc48> 4ce0: b8 01 00 00 00 mov $0x1,%eax 4ce5: f3 0f 5e f0 divss %xmm0,%xmm6 4ce9: 66 0f ef c0 pxor %xmm0,%xmm0 4ced: f3 0f 5a c6 cvtss2sd %xmm6,%xmm0 4cf1: e8 8a c3 ff ff callq 1080 <__printf_chk@plt> 4cf6: 41 83 fc 01 cmp $0x1,%r12d 4cfa: 0f 8f 26 d5 ff ff jg 2226 4d00: e9 41 dc ff ff jmpq 2946 4d05: 48 8d 35 0d 4a 00 00 lea 0x4a0d(%rip),%rsi # 9719 4d0c: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4d12: bf 00 2f 68 59 mov $0x59682f00,%edi 4d17: e8 24 5d 00 00 callq aa40 4d1c: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 4d22: bf 01 00 00 00 mov $0x1,%edi 4d27: 48 8d 35 42 6f 00 00 lea 0x6f42(%rip),%rsi # bc70 <_IO_stdin_used+0xc70> 4d2e: b8 01 00 00 00 mov $0x1,%eax 4d33: f3 0f 5e f8 divss %xmm0,%xmm7 4d37: 66 0f ef c0 pxor %xmm0,%xmm0 4d3b: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 4d3f: e8 3c c3 ff ff callq 1080 <__printf_chk@plt> 4d44: 41 83 fc 01 cmp $0x1,%r12d 4d48: 0f 8f 79 d4 ff ff jg 21c7 4d4e: e9 f3 db ff ff jmpq 2946 4d53: 48 8d 35 0d 41 00 00 lea 0x410d(%rip),%rsi # 8e67 4d5a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4d60: bf 00 2f 68 59 mov 
$0x59682f00,%edi 4d65: e8 d6 5c 00 00 callq aa40 4d6a: bf 01 00 00 00 mov $0x1,%edi 4d6f: b8 01 00 00 00 mov $0x1,%eax 4d74: 48 8d 35 48 67 00 00 lea 0x6748(%rip),%rsi # b4c3 <_IO_stdin_used+0x4c3> 4d7b: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4d7f: e8 fc c2 ff ff callq 1080 <__printf_chk@plt> 4d84: 41 83 fc 01 cmp $0x1,%r12d 4d88: 0f 8f da d3 ff ff jg 2168 4d8e: e9 b3 db ff ff jmpq 2946 4d93: 48 8d 35 15 40 00 00 lea 0x4015(%rip),%rsi # 8daf 4d9a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4da0: bf 00 2f 68 59 mov $0x59682f00,%edi 4da5: e8 96 5c 00 00 callq aa40 4daa: bf 01 00 00 00 mov $0x1,%edi 4daf: b8 01 00 00 00 mov $0x1,%eax 4db4: 48 8d 35 23 67 00 00 lea 0x6723(%rip),%rsi # b4de <_IO_stdin_used+0x4de> 4dbb: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4dbf: e8 bc c2 ff ff callq 1080 <__printf_chk@plt> 4dc4: 41 83 fc 01 cmp $0x1,%r12d 4dc8: 0f 8f 49 d3 ff ff jg 2117 4dce: e9 73 db ff ff jmpq 2946 4dd3: 48 8d 35 43 3a 00 00 lea 0x3a43(%rip),%rsi # 881d 4dda: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4de0: 4c 89 f7 mov %r14,%rdi 4de3: e8 58 5c 00 00 callq aa40 4de8: bf 01 00 00 00 mov $0x1,%edi 4ded: b8 01 00 00 00 mov $0x1,%eax 4df2: 48 8d 35 9f 6e 00 00 lea 0x6e9f(%rip),%rsi # bc98 <_IO_stdin_used+0xc98> 4df9: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4dfd: e8 7e c2 ff ff callq 1080 <__printf_chk@plt> 4e02: 41 83 fc 01 cmp $0x1,%r12d 4e06: 0f 8f ba d2 ff ff jg 20c6 4e0c: e9 35 db ff ff jmpq 2946 4e11: 48 8d 35 9c 35 00 00 lea 0x359c(%rip),%rsi # 83b4 4e18: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4e1e: 4c 89 f7 mov %r14,%rdi 4e21: e8 1a 5c 00 00 callq aa40 4e26: bf 01 00 00 00 mov $0x1,%edi 4e2b: b8 01 00 00 00 mov $0x1,%eax 4e30: 48 8d 35 89 6e 00 00 lea 0x6e89(%rip),%rsi # bcc0 <_IO_stdin_used+0xcc0> 4e37: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4e3b: e8 40 c2 ff ff callq 1080 <__printf_chk@plt> 4e40: 41 83 fc 01 cmp $0x1,%r12d 4e44: 0f 8f 2d d2 ff ff jg 2077 4e4a: e9 f7 da ff ff jmpq 2946 4e4f: 48 8d 35 f6 4a 00 00 lea 0x4af6(%rip),%rsi # 994c 4e56: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4e5c: 
4c 89 f7 mov %r14,%rdi 4e5f: e8 dc 5b 00 00 callq aa40 4e64: bf 01 00 00 00 mov $0x1,%edi 4e69: b8 01 00 00 00 mov $0x1,%eax 4e6e: 48 8d 35 73 6e 00 00 lea 0x6e73(%rip),%rsi # bce8 <_IO_stdin_used+0xce8> 4e75: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4e79: e8 02 c2 ff ff callq 1080 <__printf_chk@plt> 4e7e: 41 83 fc 01 cmp $0x1,%r12d 4e82: 0f 8f a0 d1 ff ff jg 2028 4e88: e9 b9 da ff ff jmpq 2946 4e8d: 48 8d 35 27 4b 00 00 lea 0x4b27(%rip),%rsi # 99bb 4e94: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4e9a: 4c 89 f7 mov %r14,%rdi 4e9d: e8 9e 5b 00 00 callq aa40 4ea2: bf 01 00 00 00 mov $0x1,%edi 4ea7: b8 01 00 00 00 mov $0x1,%eax 4eac: 48 8d 35 55 6e 00 00 lea 0x6e55(%rip),%rsi # bd08 <_IO_stdin_used+0xd08> 4eb3: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 4eb7: e8 c4 c1 ff ff callq 1080 <__printf_chk@plt> 4ebc: 41 83 fc 01 cmp $0x1,%r12d 4ec0: 0f 8f 13 d1 ff ff jg 1fd9 4ec6: e9 7b da ff ff jmpq 2946 4ecb: 48 8d 35 1a 4a 00 00 lea 0x4a1a(%rip),%rsi # 98ec 4ed2: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4ed8: bf 00 2f 68 59 mov $0x59682f00,%edi 4edd: e8 5e 5b 00 00 callq aa40 4ee2: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 4ee8: bf 01 00 00 00 mov $0x1,%edi 4eed: 48 8d 35 34 6e 00 00 lea 0x6e34(%rip),%rsi # bd28 <_IO_stdin_used+0xd28> 4ef4: b8 01 00 00 00 mov $0x1,%eax 4ef9: f3 0f 5e e8 divss %xmm0,%xmm5 4efd: 66 0f ef c0 pxor %xmm0,%xmm0 4f01: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 4f05: e8 76 c1 ff ff callq 1080 <__printf_chk@plt> 4f0a: 41 83 fc 01 cmp $0x1,%r12d 4f0e: 0f 8f 76 d0 ff ff jg 1f8a 4f14: e9 2d da ff ff jmpq 2946 4f19: 48 8d 35 6c 49 00 00 lea 0x496c(%rip),%rsi # 988c 4f20: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4f26: bf 00 2f 68 59 mov $0x59682f00,%edi 4f2b: e8 10 5b 00 00 callq aa40 4f30: f3 0f 10 74 24 0c movss 0xc(%rsp),%xmm6 4f36: bf 01 00 00 00 mov $0x1,%edi 4f3b: 48 8d 35 0e 6e 00 00 lea 0x6e0e(%rip),%rsi # bd50 <_IO_stdin_used+0xd50> 4f42: b8 01 00 00 00 mov $0x1,%eax 4f47: f3 0f 5e f0 divss %xmm0,%xmm6 4f4b: 66 0f ef c0 pxor %xmm0,%xmm0 4f4f: f3 0f 5a c6 cvtss2sd %xmm6,%xmm0 
4f53: e8 28 c1 ff ff callq 1080 <__printf_chk@plt> 4f58: 41 83 fc 01 cmp $0x1,%r12d 4f5c: 0f 8f c9 cf ff ff jg 1f2b 4f62: e9 df d9 ff ff jmpq 2946 4f67: 48 8d 35 0d 3c 00 00 lea 0x3c0d(%rip),%rsi # 8b7b 4f6e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4f74: bf 00 2f 68 59 mov $0x59682f00,%edi 4f79: e8 c2 5a 00 00 callq aa40 4f7e: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 4f84: bf 01 00 00 00 mov $0x1,%edi 4f89: 48 8d 35 e8 6d 00 00 lea 0x6de8(%rip),%rsi # bd78 <_IO_stdin_used+0xd78> 4f90: b8 01 00 00 00 mov $0x1,%eax 4f95: f3 0f 5e f8 divss %xmm0,%xmm7 4f99: 66 0f ef c0 pxor %xmm0,%xmm0 4f9d: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 4fa1: e8 da c0 ff ff callq 1080 <__printf_chk@plt> 4fa6: 41 83 fc 01 cmp $0x1,%r12d 4faa: 0f 8f 1c cf ff ff jg 1ecc 4fb0: e9 91 d9 ff ff jmpq 2946 4fb5: 48 8d 35 05 3a 00 00 lea 0x3a05(%rip),%rsi # 89c1 4fbc: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 4fc2: bf 00 2f 68 59 mov $0x59682f00,%edi 4fc7: e8 74 5a 00 00 callq aa40 4fcc: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 4fd2: bf 01 00 00 00 mov $0x1,%edi 4fd7: 48 8d 35 c2 6d 00 00 lea 0x6dc2(%rip),%rsi # bda0 <_IO_stdin_used+0xda0> 4fde: b8 01 00 00 00 mov $0x1,%eax 4fe3: f3 0f 5e e8 divss %xmm0,%xmm5 4fe7: 66 0f ef c0 pxor %xmm0,%xmm0 4feb: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 4fef: e8 8c c0 ff ff callq 1080 <__printf_chk@plt> 4ff4: 41 83 fc 01 cmp $0x1,%r12d 4ff8: 0f 8f 6f ce ff ff jg 1e6d 4ffe: e9 43 d9 ff ff jmpq 2946 5003: 48 8d 35 92 38 00 00 lea 0x3892(%rip),%rsi # 889c 500a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5010: bf 00 2f 68 59 mov $0x59682f00,%edi 5015: e8 26 5a 00 00 callq aa40 501a: f3 0f 10 74 24 0c movss 0xc(%rsp),%xmm6 5020: bf 01 00 00 00 mov $0x1,%edi 5025: 48 8d 35 9c 6d 00 00 lea 0x6d9c(%rip),%rsi # bdc8 <_IO_stdin_used+0xdc8> 502c: b8 01 00 00 00 mov $0x1,%eax 5031: f3 0f 5e f0 divss %xmm0,%xmm6 5035: 66 0f ef c0 pxor %xmm0,%xmm0 5039: f3 0f 5a c6 cvtss2sd %xmm6,%xmm0 503d: e8 3e c0 ff ff callq 1080 <__printf_chk@plt> 5042: 41 83 fc 01 cmp $0x1,%r12d 5046: 0f 8f c2 cd ff ff jg 1e0e 
504c: e9 f5 d8 ff ff jmpq 2946 5051: 48 8d 35 f1 32 00 00 lea 0x32f1(%rip),%rsi # 8349 5058: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 505e: 4c 89 f7 mov %r14,%rdi 5061: e8 da 59 00 00 callq aa40 5066: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 506c: bf 01 00 00 00 mov $0x1,%edi 5071: 48 8d 35 88 6d 00 00 lea 0x6d88(%rip),%rsi # be00 <_IO_stdin_used+0xe00> 5078: b8 01 00 00 00 mov $0x1,%eax 507d: f3 0f 5e f8 divss %xmm0,%xmm7 5081: 66 0f ef c0 pxor %xmm0,%xmm0 5085: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 5089: e8 f2 bf ff ff callq 1080 <__printf_chk@plt> 508e: 41 83 fc 01 cmp $0x1,%r12d 5092: 0f 8f 17 cd ff ff jg 1daf 5098: e9 a9 d8 ff ff jmpq 2946 509d: 48 8d 35 f7 31 00 00 lea 0x31f7(%rip),%rsi # 829b 50a4: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 50aa: bf 00 2f 68 59 mov $0x59682f00,%edi 50af: e8 8c 59 00 00 callq aa40 50b4: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 50ba: bf 01 00 00 00 mov $0x1,%edi 50bf: 48 8d 35 6a 6d 00 00 lea 0x6d6a(%rip),%rsi # be30 <_IO_stdin_used+0xe30> 50c6: b8 01 00 00 00 mov $0x1,%eax 50cb: f3 0f 5e e8 divss %xmm0,%xmm5 50cf: 66 0f ef c0 pxor %xmm0,%xmm0 50d3: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 50d7: e8 a4 bf ff ff callq 1080 <__printf_chk@plt> 50dc: 41 83 fc 01 cmp $0x1,%r12d 50e0: 0f 8f 6c cc ff ff jg 1d52 50e6: e9 5b d8 ff ff jmpq 2946 50eb: 48 8d 35 be 2d 00 00 lea 0x2dbe(%rip),%rsi # 7eb0 50f2: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 50f8: 4c 89 f7 mov %r14,%rdi 50fb: e8 40 59 00 00 callq aa40 5100: f3 0f 10 74 24 0c movss 0xc(%rsp),%xmm6 5106: bf 01 00 00 00 mov $0x1,%edi 510b: 48 8d 35 56 6d 00 00 lea 0x6d56(%rip),%rsi # be68 <_IO_stdin_used+0xe68> 5112: b8 01 00 00 00 mov $0x1,%eax 5117: f3 0f 5e f0 divss %xmm0,%xmm6 511b: 66 0f ef c0 pxor %xmm0,%xmm0 511f: f3 0f 5a c6 cvtss2sd %xmm6,%xmm0 5123: e8 58 bf ff ff callq 1080 <__printf_chk@plt> 5128: 41 83 fc 01 cmp $0x1,%r12d 512c: 0f 8f c1 cb ff ff jg 1cf3 5132: e9 0f d8 ff ff jmpq 2946 5137: 48 8d 35 62 2b 00 00 lea 0x2b62(%rip),%rsi # 7ca0 513e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5144: 4c 
89 f7 mov %r14,%rdi 5147: e8 f4 58 00 00 callq aa40 514c: bf 01 00 00 00 mov $0x1,%edi 5151: b8 01 00 00 00 mov $0x1,%eax 5156: 48 8d 35 3b 6d 00 00 lea 0x6d3b(%rip),%rsi # be98 <_IO_stdin_used+0xe98> 515d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5161: e8 1a bf ff ff callq 1080 <__printf_chk@plt> 5166: 41 83 fc 01 cmp $0x1,%r12d 516a: 0f 8f 26 cb ff ff jg 1c96 5170: e9 d1 d7 ff ff jmpq 2946 5175: 48 8d 35 cf 29 00 00 lea 0x29cf(%rip),%rsi # 7b4b 517c: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5182: 4c 89 f7 mov %r14,%rdi 5185: e8 b6 58 00 00 callq aa40 518a: bf 01 00 00 00 mov $0x1,%edi 518f: b8 01 00 00 00 mov $0x1,%eax 5194: 48 8d 35 2d 6d 00 00 lea 0x6d2d(%rip),%rsi # bec8 <_IO_stdin_used+0xec8> 519b: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 519f: e8 dc be ff ff callq 1080 <__printf_chk@plt> 51a4: 41 83 fc 01 cmp $0x1,%r12d 51a8: 0f 8f 99 ca ff ff jg 1c47 51ae: e9 93 d7 ff ff jmpq 2946 51b3: 48 8d 35 4f 2c 00 00 lea 0x2c4f(%rip),%rsi # 7e09 51ba: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 51c0: 4c 89 f7 mov %r14,%rdi 51c3: e8 78 58 00 00 callq aa40 51c8: bf 01 00 00 00 mov $0x1,%edi 51cd: b8 01 00 00 00 mov $0x1,%eax 51d2: 48 8d 35 1f 6d 00 00 lea 0x6d1f(%rip),%rsi # bef8 <_IO_stdin_used+0xef8> 51d9: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 51dd: e8 9e be ff ff callq 1080 <__printf_chk@plt> 51e2: 41 83 fc 01 cmp $0x1,%r12d 51e6: 0f 8f 0c ca ff ff jg 1bf8 51ec: e9 55 d7 ff ff jmpq 2946 51f1: 48 8d 35 4c 2b 00 00 lea 0x2b4c(%rip),%rsi # 7d44 51f8: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 51fe: 4c 89 f7 mov %r14,%rdi 5201: e8 3a 58 00 00 callq aa40 5206: bf 01 00 00 00 mov $0x1,%edi 520b: b8 01 00 00 00 mov $0x1,%eax 5210: 48 8d 35 19 6d 00 00 lea 0x6d19(%rip),%rsi # bf30 <_IO_stdin_used+0xf30> 5217: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 521b: e8 60 be ff ff callq 1080 <__printf_chk@plt> 5220: 41 83 fc 01 cmp $0x1,%r12d 5224: 0f 8f 7f c9 ff ff jg 1ba9 522a: e9 17 d7 ff ff jmpq 2946 522f: 48 8d 35 55 26 00 00 lea 0x2655(%rip),%rsi # 788b 5236: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 523c: 4c 89 f7 
mov %r14,%rdi 523f: e8 fc 57 00 00 callq aa40 5244: bf 01 00 00 00 mov $0x1,%edi 5249: b8 01 00 00 00 mov $0x1,%eax 524e: 48 8d 35 13 6d 00 00 lea 0x6d13(%rip),%rsi # bf68 <_IO_stdin_used+0xf68> 5255: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5259: e8 22 be ff ff callq 1080 <__printf_chk@plt> 525e: 41 83 fc 01 cmp $0x1,%r12d 5262: 0f 8f f2 c8 ff ff jg 1b5a 5268: e9 d9 d6 ff ff jmpq 2946 526d: 48 8d 35 02 21 00 00 lea 0x2102(%rip),%rsi # 7376 5274: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 527a: 4c 89 f7 mov %r14,%rdi 527d: e8 be 57 00 00 callq aa40 5282: bf 01 00 00 00 mov $0x1,%edi 5287: b8 01 00 00 00 mov $0x1,%eax 528c: 48 8d 35 fd 6c 00 00 lea 0x6cfd(%rip),%rsi # bf90 <_IO_stdin_used+0xf90> 5293: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5297: e8 e4 bd ff ff callq 1080 <__printf_chk@plt> 529c: 41 83 fc 01 cmp $0x1,%r12d 52a0: 0f 8f 65 c8 ff ff jg 1b0b 52a6: e9 9b d6 ff ff jmpq 2946 52ab: 48 8d 35 21 20 00 00 lea 0x2021(%rip),%rsi # 72d3 52b2: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 52b8: 4c 89 f7 mov %r14,%rdi 52bb: e8 80 57 00 00 callq aa40 52c0: bf 01 00 00 00 mov $0x1,%edi 52c5: b8 01 00 00 00 mov $0x1,%eax 52ca: 48 8d 35 28 62 00 00 lea 0x6228(%rip),%rsi # b4f9 <_IO_stdin_used+0x4f9> 52d1: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 52d5: e8 a6 bd ff ff callq 1080 <__printf_chk@plt> 52da: 41 83 fc 01 cmp $0x1,%r12d 52de: 0f 8f d8 c7 ff ff jg 1abc 52e4: e9 5d d6 ff ff jmpq 2946 52e9: 48 8d 35 a3 1e 00 00 lea 0x1ea3(%rip),%rsi # 7193 52f0: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 52f6: 4c 89 f7 mov %r14,%rdi 52f9: e8 42 57 00 00 callq aa40 52fe: bf 01 00 00 00 mov $0x1,%edi 5303: b8 01 00 00 00 mov $0x1,%eax 5308: 48 8d 35 03 62 00 00 lea 0x6203(%rip),%rsi # b512 <_IO_stdin_used+0x512> 530f: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5313: e8 68 bd ff ff callq 1080 <__printf_chk@plt> 5318: 41 83 fc 01 cmp $0x1,%r12d 531c: 0f 8f 4b c7 ff ff jg 1a6d 5322: e9 1f d6 ff ff jmpq 2946 5327: 48 8d 35 fb 20 00 00 lea 0x20fb(%rip),%rsi # 7429 532e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5334: 4c 89 f7 mov 
%r14,%rdi 5337: e8 04 57 00 00 callq aa40 533c: bf 01 00 00 00 mov $0x1,%edi 5341: b8 01 00 00 00 mov $0x1,%eax 5346: 48 8d 35 e0 61 00 00 lea 0x61e0(%rip),%rsi # b52d <_IO_stdin_used+0x52d> 534d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5351: e8 2a bd ff ff callq 1080 <__printf_chk@plt> 5356: 41 83 fc 01 cmp $0x1,%r12d 535a: 0f 8f be c6 ff ff jg 1a1e 5360: e9 e1 d5 ff ff jmpq 2946 5365: 48 8d 35 c4 1e 00 00 lea 0x1ec4(%rip),%rsi # 7230 536c: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5372: 4c 89 f7 mov %r14,%rdi 5375: e8 c6 56 00 00 callq aa40 537a: bf 01 00 00 00 mov $0x1,%edi 537f: b8 01 00 00 00 mov $0x1,%eax 5384: 48 8d 35 bd 61 00 00 lea 0x61bd(%rip),%rsi # b548 <_IO_stdin_used+0x548> 538b: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 538f: e8 ec bc ff ff callq 1080 <__printf_chk@plt> 5394: 41 83 fc 01 cmp $0x1,%r12d 5398: 0f 8f 31 c6 ff ff jg 19cf 539e: e9 a3 d5 ff ff jmpq 2946 53a3: 48 8d 35 34 1d 00 00 lea 0x1d34(%rip),%rsi # 70de 53aa: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 53b0: 4c 89 f7 mov %r14,%rdi 53b3: e8 88 56 00 00 callq aa40 53b8: bf 01 00 00 00 mov $0x1,%edi 53bd: b8 01 00 00 00 mov $0x1,%eax 53c2: 48 8d 35 92 61 00 00 lea 0x6192(%rip),%rsi # b55b <_IO_stdin_used+0x55b> 53c9: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 53cd: e8 ae bc ff ff callq 1080 <__printf_chk@plt> 53d2: 41 83 fc 01 cmp $0x1,%r12d 53d6: 0f 8f a4 c5 ff ff jg 1980 53dc: e9 65 d5 ff ff jmpq 2946 53e1: 48 8d 35 6a 1c 00 00 lea 0x1c6a(%rip),%rsi # 7052 53e8: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 53ee: 4c 89 f7 mov %r14,%rdi 53f1: e8 4a 56 00 00 callq aa40 53f6: bf 01 00 00 00 mov $0x1,%edi 53fb: b8 01 00 00 00 mov $0x1,%eax 5400: 48 8d 35 b1 6b 00 00 lea 0x6bb1(%rip),%rsi # bfb8 <_IO_stdin_used+0xfb8> 5407: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 540b: e8 70 bc ff ff callq 1080 <__printf_chk@plt> 5410: 41 83 fc 01 cmp $0x1,%r12d 5414: 0f 8f 17 c5 ff ff jg 1931 541a: e9 27 d5 ff ff jmpq 2946 541f: 48 8d 35 14 1b 00 00 lea 0x1b14(%rip),%rsi # 6f3a 5426: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 542c: 4c 89 f7 mov 
%r14,%rdi 542f: e8 0c 56 00 00 callq aa40 5434: bf 01 00 00 00 mov $0x1,%edi 5439: b8 01 00 00 00 mov $0x1,%eax 543e: 48 8d 35 31 61 00 00 lea 0x6131(%rip),%rsi # b576 <_IO_stdin_used+0x576> 5445: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5449: e8 32 bc ff ff callq 1080 <__printf_chk@plt> 544e: 41 83 fc 01 cmp $0x1,%r12d 5452: 0f 8f 8a c4 ff ff jg 18e2 5458: e9 e9 d4 ff ff jmpq 2946 545d: 48 8d 35 62 1b 00 00 lea 0x1b62(%rip),%rsi # 6fc6 5464: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 546a: 4c 89 f7 mov %r14,%rdi 546d: e8 ce 55 00 00 callq aa40 5472: bf 01 00 00 00 mov $0x1,%edi 5477: b8 01 00 00 00 mov $0x1,%eax 547c: 48 8d 35 0a 61 00 00 lea 0x610a(%rip),%rsi # b58d <_IO_stdin_used+0x58d> 5483: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5487: e8 f4 bb ff ff callq 1080 <__printf_chk@plt> 548c: 41 83 fc 01 cmp $0x1,%r12d 5490: 0f 8f fd c3 ff ff jg 1893 5496: e9 ab d4 ff ff jmpq 2946 549b: 48 8d 35 42 4f 00 00 lea 0x4f42(%rip),%rsi # a3e4 54a2: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 54a8: 4c 89 f7 mov %r14,%rdi 54ab: e8 90 55 00 00 callq aa40 54b0: bf 01 00 00 00 mov $0x1,%edi 54b5: b8 01 00 00 00 mov $0x1,%eax 54ba: 48 8d 35 e3 60 00 00 lea 0x60e3(%rip),%rsi # b5a4 <_IO_stdin_used+0x5a4> 54c1: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 54c5: e8 b6 bb ff ff callq 1080 <__printf_chk@plt> 54ca: 41 83 fc 01 cmp $0x1,%r12d 54ce: 0f 8f 70 c3 ff ff jg 1844 54d4: e9 6d d4 ff ff jmpq 2946 54d9: 48 8d 35 b3 4f 00 00 lea 0x4fb3(%rip),%rsi # a493 54e0: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 54e6: 4c 89 f7 mov %r14,%rdi 54e9: e8 52 55 00 00 callq aa40 54ee: bf 01 00 00 00 mov $0x1,%edi 54f3: b8 01 00 00 00 mov $0x1,%eax 54f8: 48 8d 35 c1 60 00 00 lea 0x60c1(%rip),%rsi # b5c0 <_IO_stdin_used+0x5c0> 54ff: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5503: e8 78 bb ff ff callq 1080 <__printf_chk@plt> 5508: 41 83 fc 01 cmp $0x1,%r12d 550c: 0f 8f e3 c2 ff ff jg 17f5 5512: e9 2f d4 ff ff jmpq 2946 5517: 48 8d 35 12 4e 00 00 lea 0x4e12(%rip),%rsi # a330 551e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5524: 4c 89 f7 mov 
%r14,%rdi 5527: e8 14 55 00 00 callq aa40 552c: bf 01 00 00 00 mov $0x1,%edi 5531: b8 01 00 00 00 mov $0x1,%eax 5536: 48 8d 35 97 60 00 00 lea 0x6097(%rip),%rsi # b5d4 <_IO_stdin_used+0x5d4> 553d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5541: e8 3a bb ff ff callq 1080 <__printf_chk@plt> 5546: 41 83 fc 01 cmp $0x1,%r12d 554a: 0f 8f 56 c2 ff ff jg 17a6 5550: e9 f1 d3 ff ff jmpq 2946 5555: 48 8d 35 6d 20 00 00 lea 0x206d(%rip),%rsi # 75c9 555c: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5562: 4c 89 f7 mov %r14,%rdi 5565: e8 d6 54 00 00 callq aa40 556a: bf 01 00 00 00 mov $0x1,%edi 556f: b8 01 00 00 00 mov $0x1,%eax 5574: 48 8d 35 6d 60 00 00 lea 0x606d(%rip),%rsi # b5e8 <_IO_stdin_used+0x5e8> 557b: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 557f: e8 fc ba ff ff callq 1080 <__printf_chk@plt> 5584: 41 83 fc 01 cmp $0x1,%r12d 5588: 0f 8f c9 c1 ff ff jg 1757 558e: e9 b3 d3 ff ff jmpq 2946 5593: 48 8d 35 57 1f 00 00 lea 0x1f57(%rip),%rsi # 74f1 559a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 55a0: 4c 89 f7 mov %r14,%rdi 55a3: e8 98 54 00 00 callq aa40 55a8: bf 01 00 00 00 mov $0x1,%edi 55ad: b8 01 00 00 00 mov $0x1,%eax 55b2: 48 8d 35 45 60 00 00 lea 0x6045(%rip),%rsi # b5fe <_IO_stdin_used+0x5fe> 55b9: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 55bd: e8 be ba ff ff callq 1080 <__printf_chk@plt> 55c2: 41 83 fc 01 cmp $0x1,%r12d 55c6: 0f 8f 3c c1 ff ff jg 1708 55cc: e9 75 d3 ff ff jmpq 2946 55d1: 48 8d 35 0f 21 00 00 lea 0x210f(%rip),%rsi # 76e7 55d8: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 55de: 4c 89 f7 mov %r14,%rdi 55e1: e8 5a 54 00 00 callq aa40 55e6: bf 01 00 00 00 mov $0x1,%edi 55eb: b8 01 00 00 00 mov $0x1,%eax 55f0: 48 8d 35 20 60 00 00 lea 0x6020(%rip),%rsi # b617 <_IO_stdin_used+0x617> 55f7: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 55fb: e8 80 ba ff ff callq 1080 <__printf_chk@plt> 5600: 41 83 fc 01 cmp $0x1,%r12d 5604: 0f 8f af c0 ff ff jg 16b9 560a: e9 37 d3 ff ff jmpq 2946 560f: 48 8d 35 8e 21 00 00 lea 0x218e(%rip),%rsi # 77a4 5616: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 561c: 4c 89 f7 mov 
%r14,%rdi 561f: e8 1c 54 00 00 callq aa40 5624: bf 01 00 00 00 mov $0x1,%edi 5629: b8 01 00 00 00 mov $0x1,%eax 562e: 48 8d 35 fd 5f 00 00 lea 0x5ffd(%rip),%rsi # b632 <_IO_stdin_used+0x632> 5635: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5639: e8 42 ba ff ff callq 1080 <__printf_chk@plt> 563e: 41 83 fc 01 cmp $0x1,%r12d 5642: 0f 8f 22 c0 ff ff jg 166a 5648: e9 f9 d2 ff ff jmpq 2946 564d: 48 8d 35 52 16 00 00 lea 0x1652(%rip),%rsi # 6ca6 5654: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 565a: 4c 89 f7 mov %r14,%rdi 565d: e8 de 53 00 00 callq aa40 5662: bf 01 00 00 00 mov $0x1,%edi 5667: b8 01 00 00 00 mov $0x1,%eax 566c: 48 8d 35 65 69 00 00 lea 0x6965(%rip),%rsi # bfd8 <_IO_stdin_used+0xfd8> 5673: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5677: e8 04 ba ff ff callq 1080 <__printf_chk@plt> 567c: 41 83 fc 01 cmp $0x1,%r12d 5680: 0f 8f 95 bf ff ff jg 161b 5686: e9 bb d2 ff ff jmpq 2946 568b: 48 8d 35 5c 52 00 00 lea 0x525c(%rip),%rsi # a8ee 5692: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5698: 4c 89 f7 mov %r14,%rdi 569b: e8 a0 53 00 00 callq aa40 56a0: bf 01 00 00 00 mov $0x1,%edi 56a5: b8 01 00 00 00 mov $0x1,%eax 56aa: 48 8d 35 9e 5f 00 00 lea 0x5f9e(%rip),%rsi # b64f <_IO_stdin_used+0x64f> 56b1: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 56b5: e8 c6 b9 ff ff callq 1080 <__printf_chk@plt> 56ba: 41 83 fc 01 cmp $0x1,%r12d 56be: 0f 8f bd be ff ff jg 1581 56c4: e9 7d d2 ff ff jmpq 2946 56c9: 48 8d 35 a7 51 00 00 lea 0x51a7(%rip),%rsi # a877 56d0: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 56d6: 4c 89 f7 mov %r14,%rdi 56d9: e8 62 53 00 00 callq aa40 56de: bf 01 00 00 00 mov $0x1,%edi 56e3: b8 01 00 00 00 mov $0x1,%eax 56e8: 48 8d 35 77 5f 00 00 lea 0x5f77(%rip),%rsi # b666 <_IO_stdin_used+0x666> 56ef: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 56f3: e8 88 b9 ff ff callq 1080 <__printf_chk@plt> 56f8: 41 83 fc 01 cmp $0x1,%r12d 56fc: 0f 8f 30 be ff ff jg 1532 5702: e9 3f d2 ff ff jmpq 2946 5707: 48 8d 35 67 50 00 00 lea 0x5067(%rip),%rsi # a775 570e: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5714: 4c 89 f7 mov 
%r14,%rdi 5717: e8 24 53 00 00 callq aa40 571c: bf 01 00 00 00 mov $0x1,%edi 5721: b8 01 00 00 00 mov $0x1,%eax 5726: 48 8d 35 50 5f 00 00 lea 0x5f50(%rip),%rsi # b67d <_IO_stdin_used+0x67d> 572d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5731: e8 4a b9 ff ff callq 1080 <__printf_chk@plt> 5736: 41 83 fc 01 cmp $0x1,%r12d 573a: 0f 8f a3 bd ff ff jg 14e3 5740: e9 01 d2 ff ff jmpq 2946 5745: 48 8d 35 e7 4e 00 00 lea 0x4ee7(%rip),%rsi # a633 574c: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5752: 4c 89 f7 mov %r14,%rdi 5755: e8 e6 52 00 00 callq aa40 575a: bf 01 00 00 00 mov $0x1,%edi 575f: b8 01 00 00 00 mov $0x1,%eax 5764: 48 8d 35 2a 5f 00 00 lea 0x5f2a(%rip),%rsi # b695 <_IO_stdin_used+0x695> 576b: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 576f: e8 0c b9 ff ff callq 1080 <__printf_chk@plt> 5774: 41 83 fc 01 cmp $0x1,%r12d 5778: 0f 8f 16 bd ff ff jg 1494 577e: e9 c3 d1 ff ff jmpq 2946 5783: 48 8d 35 74 4f 00 00 lea 0x4f74(%rip),%rsi # a6fe 578a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5790: 4c 89 f7 mov %r14,%rdi 5793: e8 a8 52 00 00 callq aa40 5798: bf 01 00 00 00 mov $0x1,%edi 579d: b8 01 00 00 00 mov $0x1,%eax 57a2: 48 8d 35 04 5f 00 00 lea 0x5f04(%rip),%rsi # b6ad <_IO_stdin_used+0x6ad> 57a9: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 57ad: e8 ce b8 ff ff callq 1080 <__printf_chk@plt> 57b2: 41 83 fc 01 cmp $0x1,%r12d 57b6: 0f 8f 89 bc ff ff jg 1445 57bc: e9 85 d1 ff ff jmpq 2946 57c1: 48 8d 35 f4 4d 00 00 lea 0x4df4(%rip),%rsi # a5bc 57c8: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 57ce: 4c 89 f7 mov %r14,%rdi 57d1: e8 6a 52 00 00 callq aa40 57d6: bf 01 00 00 00 mov $0x1,%edi 57db: b8 01 00 00 00 mov $0x1,%eax 57e0: 48 8d 35 41 68 00 00 lea 0x6841(%rip),%rsi # c028 <_IO_stdin_used+0x1028> 57e7: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 57eb: e8 90 b8 ff ff callq 1080 <__printf_chk@plt> 57f0: 41 83 fc 01 cmp $0x1,%r12d 57f4: 0f 8f fc bb ff ff jg 13f6 57fa: e9 47 d1 ff ff jmpq 2946 57ff: 48 8d 35 41 4d 00 00 lea 0x4d41(%rip),%rsi # a547 5806: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 580c: 4c 89 f7 mov 
%r14,%rdi 580f: e8 2c 52 00 00 callq aa40 5814: bf 01 00 00 00 mov $0x1,%edi 5819: b8 01 00 00 00 mov $0x1,%eax 581e: 48 8d 35 a0 5e 00 00 lea 0x5ea0(%rip),%rsi # b6c5 <_IO_stdin_used+0x6c5> 5825: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5829: e8 52 b8 ff ff callq 1080 <__printf_chk@plt> 582e: 41 83 fc 01 cmp $0x1,%r12d 5832: 0f 8f 6f bb ff ff jg 13a7 5838: e9 09 d1 ff ff jmpq 2946 583d: 48 8d 35 e6 15 00 00 lea 0x15e6(%rip),%rsi # 6e2a 5844: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 584a: 4c 89 f7 mov %r14,%rdi 584d: e8 ee 51 00 00 callq aa40 5852: bf 01 00 00 00 mov $0x1,%edi 5857: b8 01 00 00 00 mov $0x1,%eax 585c: 48 8d 35 80 5e 00 00 lea 0x5e80(%rip),%rsi # b6e3 <_IO_stdin_used+0x6e3> 5863: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5867: e8 14 b8 ff ff callq 1080 <__printf_chk@plt> 586c: 41 83 fc 01 cmp $0x1,%r12d 5870: 0f 8f e2 ba ff ff jg 1358 5876: e9 cb d0 ff ff jmpq 2946 587b: 4c 8d 3d a8 15 00 00 lea 0x15a8(%rip),%r15 # 6e2a 5882: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5888: 4c 89 f7 mov %r14,%rdi 588b: 4c 89 fe mov %r15,%rsi 588e: e8 ad 51 00 00 callq aa40 5893: bf 01 00 00 00 mov $0x1,%edi 5898: b8 01 00 00 00 mov $0x1,%eax 589d: 48 8d 35 5c 5e 00 00 lea 0x5e5c(%rip),%rsi # b700 <_IO_stdin_used+0x700> 58a4: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 58a8: e8 d3 b7 ff ff callq 1080 <__printf_chk@plt> 58ad: 41 83 fc 01 cmp $0x1,%r12d 58b1: 0f 8f 5b ba ff ff jg 1312 58b7: e9 8a d0 ff ff jmpq 2946 58bc: 48 8d 35 db 14 00 00 lea 0x14db(%rip),%rsi # 6d9e 58c3: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 58c9: 4c 89 f7 mov %r14,%rdi 58cc: e8 6f 51 00 00 callq aa40 58d1: bf 01 00 00 00 mov $0x1,%edi 58d6: b8 01 00 00 00 mov $0x1,%eax 58db: 48 8d 35 3b 5e 00 00 lea 0x5e3b(%rip),%rsi # b71d <_IO_stdin_used+0x71d> 58e2: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 58e6: e8 95 b7 ff ff callq 1080 <__printf_chk@plt> 58eb: 41 83 fc 01 cmp $0x1,%r12d 58ef: 0f 8f d0 b9 ff ff jg 12c5 58f5: e9 4c d0 ff ff jmpq 2946 58fa: 48 8d 35 3e 14 00 00 lea 0x143e(%rip),%rsi # 6d3f 5901: f3 0f 10 44 24 08 movss 
0x8(%rsp),%xmm0 5907: 4c 89 f7 mov %r14,%rdi 590a: e8 31 51 00 00 callq aa40 590f: bf 01 00 00 00 mov $0x1,%edi 5914: b8 01 00 00 00 mov $0x1,%eax 5919: 48 8d 35 11 5e 00 00 lea 0x5e11(%rip),%rsi # b731 <_IO_stdin_used+0x731> 5920: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5924: e8 57 b7 ff ff callq 1080 <__printf_chk@plt> 5929: 41 83 fc 01 cmp $0x1,%r12d 592d: 0f 8f 43 b9 ff ff jg 1276 5933: e9 0e d0 ff ff jmpq 2946 5938: 45 31 ed xor %r13d,%r13d 593b: 48 8d 74 24 10 lea 0x10(%rsp),%rsi 5940: 48 8d 7c 24 20 lea 0x20(%rsp),%rdi 5945: e8 06 b7 ff ff callq 1050 594a: 4c 89 f7 mov %r14,%rdi 594d: e8 f7 12 00 00 callq 6c49 5952: 48 8d 74 24 18 lea 0x18(%rsp),%rsi 5957: 48 8d 7c 24 30 lea 0x30(%rsp),%rdi 595c: e8 ef b6 ff ff callq 1050 5961: 48 8b 44 24 38 mov 0x38(%rsp),%rax 5966: 48 2b 44 24 28 sub 0x28(%rsp),%rax 596b: be e8 03 00 00 mov $0x3e8,%esi 5970: 48 99 cqto 5972: 48 8b 4c 24 30 mov 0x30(%rsp),%rcx 5977: 48 2b 4c 24 20 sub 0x20(%rsp),%rcx 597c: 48 f7 fe idiv %rsi 597f: 48 69 c9 e8 03 00 00 imul $0x3e8,%rcx,%rcx 5986: 48 01 c1 add %rax,%rcx 5989: 0f 88 03 0b 00 00 js 6492 598f: 66 0f ef c0 pxor %xmm0,%xmm0 5993: f3 48 0f 2a c1 cvtsi2ss %rcx,%xmm0 5998: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 599c: f2 0f 59 05 d4 66 00 mulsd 0x66d4(%rip),%xmm0 # c078 <_IO_stdin_used+0x1078> 59a3: 00 59a4: 4d 85 f6 test %r14,%r14 59a7: 0f 88 cd 0a 00 00 js 647a 59ad: 66 0f ef c9 pxor %xmm1,%xmm1 59b1: f3 49 0f 2a ce cvtsi2ss %r14,%xmm1 59b6: f3 0f 5a c9 cvtss2sd %xmm1,%xmm1 59ba: f2 0f 5e c1 divsd %xmm1,%xmm0 59be: bf 01 00 00 00 mov $0x1,%edi 59c3: f3 0f 10 2d bd 66 00 movss 0x66bd(%rip),%xmm5 # c088 <_IO_stdin_used+0x1088> 59ca: 00 59cb: 48 8d 35 a6 5d 00 00 lea 0x5da6(%rip),%rsi # b778 <_IO_stdin_used+0x778> 59d2: b8 01 00 00 00 mov $0x1,%eax 59d7: f3 0f 11 6c 24 0c movss %xmm5,0xc(%rsp) 59dd: f2 0f 5a c0 cvtsd2ss %xmm0,%xmm0 59e1: f3 0f 5e e8 divss %xmm0,%xmm5 59e5: 66 0f ef c0 pxor %xmm0,%xmm0 59e9: f3 0f 11 6c 24 08 movss %xmm5,0x8(%rsp) 59ef: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 59f3: e8 88 
b6 ff ff callq 1080 <__printf_chk@plt> 59f8: 45 85 ed test %r13d,%r13d 59fb: 0f 84 8f de ff ff je 3890 5a01: 48 8d 35 ef 31 00 00 lea 0x31ef(%rip),%rsi # 8bf7 5a08: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5a0e: bf 00 2f 68 59 mov $0x59682f00,%edi 5a13: e8 28 50 00 00 callq aa40 5a18: bf 01 00 00 00 mov $0x1,%edi 5a1d: b8 01 00 00 00 mov $0x1,%eax 5a22: 48 8d 35 34 56 00 00 lea 0x5634(%rip),%rsi # b05d <_IO_stdin_used+0x5d> 5a29: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5a2d: e8 4e b6 ff ff callq 1080 <__printf_chk@plt> 5a32: 48 8d 35 ff 3b 00 00 lea 0x3bff(%rip),%rsi # 9638 5a39: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5a3f: bf 00 2f 68 59 mov $0x59682f00,%edi 5a44: e8 f7 4f 00 00 callq aa40 5a49: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 5a4f: bf 01 00 00 00 mov $0x1,%edi 5a54: 48 8d 35 45 5d 00 00 lea 0x5d45(%rip),%rsi # b7a0 <_IO_stdin_used+0x7a0> 5a5b: b8 01 00 00 00 mov $0x1,%eax 5a60: f3 0f 5e f8 divss %xmm0,%xmm7 5a64: 66 0f ef c0 pxor %xmm0,%xmm0 5a68: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 5a6c: e8 0f b6 ff ff callq 1080 <__printf_chk@plt> 5a71: 48 8d 35 60 32 00 00 lea 0x3260(%rip),%rsi # 8cd8 5a78: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5a7e: bf 00 2f 68 59 mov $0x59682f00,%edi 5a83: e8 b8 4f 00 00 callq aa40 5a88: bf 01 00 00 00 mov $0x1,%edi 5a8d: b8 01 00 00 00 mov $0x1,%eax 5a92: 48 8d 35 2f 5d 00 00 lea 0x5d2f(%rip),%rsi # b7c8 <_IO_stdin_used+0x7c8> 5a99: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5a9d: e8 de b5 ff ff callq 1080 <__printf_chk@plt> 5aa2: 48 8d 35 20 3a 00 00 lea 0x3a20(%rip),%rsi # 94c9 5aa9: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5aaf: bf 05 7a d7 03 mov $0x3d77a05,%edi 5ab4: 48 c1 e7 09 shl $0x9,%rdi 5ab8: e8 83 4f 00 00 callq aa40 5abd: bf 01 00 00 00 mov $0x1,%edi 5ac2: b8 01 00 00 00 mov $0x1,%eax 5ac7: 48 8d 35 22 5d 00 00 lea 0x5d22(%rip),%rsi # b7f0 <_IO_stdin_used+0x7f0> 5ace: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5ad2: e8 a9 b5 ff ff callq 1080 <__printf_chk@plt> 5ad7: 48 8d 35 af 1f 00 00 lea 0x1faf(%rip),%rsi # 7a8d 5ade: f3 0f 10 44 24 08 movss 
0x8(%rsp),%xmm0 5ae4: bf 00 2f 68 59 mov $0x59682f00,%edi 5ae9: e8 52 4f 00 00 callq aa40 5aee: bf 01 00 00 00 mov $0x1,%edi 5af3: b8 01 00 00 00 mov $0x1,%eax 5af8: 48 8d 35 21 5d 00 00 lea 0x5d21(%rip),%rsi # b820 <_IO_stdin_used+0x820> 5aff: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5b03: e8 78 b5 ff ff callq 1080 <__printf_chk@plt> 5b08: 48 8d 35 a1 23 00 00 lea 0x23a1(%rip),%rsi # 7eb0 5b0f: 4c 89 f7 mov %r14,%rdi 5b12: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5b18: e8 23 4f 00 00 callq aa40 5b1d: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 5b23: bf 01 00 00 00 mov $0x1,%edi 5b28: 48 8d 35 11 5d 00 00 lea 0x5d11(%rip),%rsi # b840 <_IO_stdin_used+0x840> 5b2f: b8 01 00 00 00 mov $0x1,%eax 5b34: f3 0f 5e f8 divss %xmm0,%xmm7 5b38: 66 0f ef c0 pxor %xmm0,%xmm0 5b3c: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 5b40: e8 3b b5 ff ff callq 1080 <__printf_chk@plt> 5b45: 48 8d 35 c5 1d 00 00 lea 0x1dc5(%rip),%rsi # 7911 5b4c: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5b52: bf 00 2f 68 59 mov $0x59682f00,%edi 5b57: e8 e4 4e 00 00 callq aa40 5b5c: bf 01 00 00 00 mov $0x1,%edi 5b61: b8 01 00 00 00 mov $0x1,%eax 5b66: 48 8d 35 fb 5c 00 00 lea 0x5cfb(%rip),%rsi # b868 <_IO_stdin_used+0x868> 5b6d: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5b71: e8 0a b5 ff ff callq 1080 <__printf_chk@plt> 5b76: 48 8d 35 52 1e 00 00 lea 0x1e52(%rip),%rsi # 79cf 5b7d: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5b83: bf 00 2f 68 59 mov $0x59682f00,%edi 5b88: e8 b3 4e 00 00 callq aa40 5b8d: bf 01 00 00 00 mov $0x1,%edi 5b92: b8 01 00 00 00 mov $0x1,%eax 5b97: 48 8d 35 f2 5c 00 00 lea 0x5cf2(%rip),%rsi # b890 <_IO_stdin_used+0x890> 5b9e: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5ba2: e8 d9 b4 ff ff callq 1080 <__printf_chk@plt> 5ba7: 48 8d 35 1b 26 00 00 lea 0x261b(%rip),%rsi # 81c9 5bae: 4c 89 f7 mov %r14,%rdi 5bb1: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5bb7: e8 84 4e 00 00 callq aa40 5bbc: f3 0f 10 6c 24 0c movss 0xc(%rsp),%xmm5 5bc2: bf 01 00 00 00 mov $0x1,%edi 5bc7: 48 8d 35 f2 5c 00 00 lea 0x5cf2(%rip),%rsi # b8c0 
<_IO_stdin_used+0x8c0> 5bce: b8 01 00 00 00 mov $0x1,%eax 5bd3: f3 0f 5e e8 divss %xmm0,%xmm5 5bd7: 66 0f ef c0 pxor %xmm0,%xmm0 5bdb: f3 0f 5a c5 cvtss2sd %xmm5,%xmm0 5bdf: e8 9c b4 ff ff callq 1080 <__printf_chk@plt> 5be4: 48 8d 35 3a 24 00 00 lea 0x243a(%rip),%rsi # 8025 5beb: 4c 89 f7 mov %r14,%rdi 5bee: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5bf4: e8 47 4e 00 00 callq aa40 5bf9: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 5bff: bf 01 00 00 00 mov $0x1,%edi 5c04: 48 8d 35 e5 5c 00 00 lea 0x5ce5(%rip),%rsi # b8f0 <_IO_stdin_used+0x8f0> 5c0b: b8 01 00 00 00 mov $0x1,%eax 5c10: f3 0f 5e f8 divss %xmm0,%xmm7 5c14: 66 0f ef c0 pxor %xmm0,%xmm0 5c18: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 5c1c: e8 5f b4 ff ff callq 1080 <__printf_chk@plt> 5c21: 48 8d 35 cf 24 00 00 lea 0x24cf(%rip),%rsi # 80f7 5c28: 4c 89 f7 mov %r14,%rdi 5c2b: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5c31: e8 0a 4e 00 00 callq aa40 5c36: f3 0f 10 7c 24 0c movss 0xc(%rsp),%xmm7 5c3c: bf 01 00 00 00 mov $0x1,%edi 5c41: 48 8d 35 d8 5c 00 00 lea 0x5cd8(%rip),%rsi # b920 <_IO_stdin_used+0x920> 5c48: b8 01 00 00 00 mov $0x1,%eax 5c4d: f3 0f 5e f8 divss %xmm0,%xmm7 5c51: 66 0f ef c0 pxor %xmm0,%xmm0 5c55: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 5c59: e8 22 b4 ff ff callq 1080 <__printf_chk@plt> 5c5e: 48 8d 35 bd 33 00 00 lea 0x33bd(%rip),%rsi # 9022 5c65: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5c6b: bf 05 7a d7 03 mov $0x3d77a05,%edi 5c70: 48 c1 e7 09 shl $0x9,%rdi 5c74: e8 c7 4d 00 00 callq aa40 5c79: bf 01 00 00 00 mov $0x1,%edi 5c7e: b8 01 00 00 00 mov $0x1,%eax 5c83: 48 8d 35 ce 5c 00 00 lea 0x5cce(%rip),%rsi # b958 <_IO_stdin_used+0x958> 5c8a: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5c8e: e8 ed b3 ff ff callq 1080 <__printf_chk@plt> 5c93: 48 8d 35 69 34 00 00 lea 0x3469(%rip),%rsi # 9103 5c9a: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5ca0: bf 05 7a d7 03 mov $0x3d77a05,%edi 5ca5: 48 c1 e7 09 shl $0x9,%rdi 5ca9: e8 92 4d 00 00 callq aa40 5cae: bf 01 00 00 00 mov $0x1,%edi 5cb3: b8 01 00 00 00 mov $0x1,%eax 5cb8: 48 8d 35 c1 
5c 00 00 lea 0x5cc1(%rip),%rsi # b980 <_IO_stdin_used+0x980> 5cbf: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5cc3: e8 b8 b3 ff ff callq 1080 <__printf_chk@plt> 5cc8: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5cce: 48 8d 35 db 4c 00 00 lea 0x4cdb(%rip),%rsi # a9b0 5cd5: bf 00 2f 68 59 mov $0x59682f00,%edi 5cda: e8 61 4d 00 00 callq aa40 5cdf: bf 01 00 00 00 mov $0x1,%edi 5ce4: b8 01 00 00 00 mov $0x1,%eax 5ce9: 48 8d 35 88 53 00 00 lea 0x5388(%rip),%rsi # b078 <_IO_stdin_used+0x78> 5cf0: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5cf4: e8 87 b3 ff ff callq 1080 <__printf_chk@plt> 5cf9: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5cff: 48 8d 35 fa 4c 00 00 lea 0x4cfa(%rip),%rsi # aa00 5d06: bf 00 2f 68 59 mov $0x59682f00,%edi 5d0b: e8 30 4d 00 00 callq aa40 5d10: bf 01 00 00 00 mov $0x1,%edi 5d15: b8 01 00 00 00 mov $0x1,%eax 5d1a: 48 8d 35 74 53 00 00 lea 0x5374(%rip),%rsi # b095 <_IO_stdin_used+0x95> 5d21: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5d25: e8 56 b3 ff ff callq 1080 <__printf_chk@plt> 5d2a: 48 8d 35 ee 26 00 00 lea 0x26ee(%rip),%rsi # 841f 5d31: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5d37: bf 00 2f 68 59 mov $0x59682f00,%edi 5d3c: e8 ff 4c 00 00 callq aa40 5d41: bf 01 00 00 00 mov $0x1,%edi 5d46: b8 01 00 00 00 mov $0x1,%eax 5d4b: 48 8d 35 61 53 00 00 lea 0x5361(%rip),%rsi # b0b3 <_IO_stdin_used+0xb3> 5d52: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5d56: e8 25 b3 ff ff callq 1080 <__printf_chk@plt> 5d5b: 48 8d 35 21 2a 00 00 lea 0x2a21(%rip),%rsi # 8783 5d62: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5d68: bf 00 2f 68 59 mov $0x59682f00,%edi 5d6d: e8 ce 4c 00 00 callq aa40 5d72: bf 01 00 00 00 mov $0x1,%edi 5d77: b8 01 00 00 00 mov $0x1,%eax 5d7c: 48 8d 35 46 53 00 00 lea 0x5346(%rip),%rsi # b0c9 <_IO_stdin_used+0xc9> 5d83: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5d87: e8 f4 b2 ff ff callq 1080 <__printf_chk@plt> 5d8c: 48 8d 35 26 27 00 00 lea 0x2726(%rip),%rsi # 84b9 5d93: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5d99: bf 00 2f 68 59 mov $0x59682f00,%edi 5d9e: e8 9d 4c 00 00 callq aa40 5da3: bf 01 00 
00 00 mov $0x1,%edi 5da8: b8 01 00 00 00 mov $0x1,%eax 5dad: 48 8d 35 fc 5b 00 00 lea 0x5bfc(%rip),%rsi # b9b0 <_IO_stdin_used+0x9b0> 5db4: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5db8: e8 c3 b2 ff ff callq 1080 <__printf_chk@plt> 5dbd: 48 8d 35 af 27 00 00 lea 0x27af(%rip),%rsi # 8573 5dc4: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5dca: bf 00 2f 68 59 mov $0x59682f00,%edi 5dcf: e8 6c 4c 00 00 callq aa40 5dd4: bf 01 00 00 00 mov $0x1,%edi 5dd9: b8 01 00 00 00 mov $0x1,%eax 5dde: 48 8d 35 fa 52 00 00 lea 0x52fa(%rip),%rsi # b0df <_IO_stdin_used+0xdf> 5de5: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5de9: e8 92 b2 ff ff callq 1080 <__printf_chk@plt> 5dee: 48 8d 35 f8 28 00 00 lea 0x28f8(%rip),%rsi # 86ed 5df5: f3 0f 10 44 24 08 movss 0x8(%rsp),%xmm0 5dfb: bf 00 2f 68 59 mov $0x59682f00,%edi 5e00: e8 3b 4c 00 00 callq aa40 5e05: bf 01 00 00 00 mov $0x1,%edi 5e0a: b8 01 00 00 00 mov $0x1,%eax 5e0f: 48 8d 35 ba 5b 00 00 lea 0x5bba(%rip),%rsi # b9d0 <_IO_stdin_used+0x9d0> 5e16: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5e1a: e8 61 b2 ff ff callq 1080 <__printf_chk@plt> 5e1f: e9 31 da ff ff jmpq 3855 5e24: ba 0c 00 00 00 mov $0xc,%edx 5e29: 48 8d 35 88 53 00 00 lea 0x5388(%rip),%rsi # b1b8 <_IO_stdin_used+0x1b8> 5e30: 4c 89 ef mov %r13,%rdi 5e33: e8 f8 b1 ff ff callq 1030 5e38: 85 c0 test %eax,%eax 5e3a: 0f 85 2b cb ff ff jne 296b 5e40: 48 8d 35 a6 28 00 00 lea 0x28a6(%rip),%rsi # 86ed 5e47: f3 0f 10 05 39 62 00 movss 0x6239(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 5e4e: 00 5e4f: bf 00 2f 68 59 mov $0x59682f00,%edi 5e54: e8 e7 4b 00 00 callq aa40 5e59: bf 01 00 00 00 mov $0x1,%edi 5e5e: b8 01 00 00 00 mov $0x1,%eax 5e63: 48 8d 35 66 5b 00 00 lea 0x5b66(%rip),%rsi # b9d0 <_IO_stdin_used+0x9d0> 5e6a: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5e6e: e8 0d b2 ff ff callq 1080 <__printf_chk@plt> 5e73: 41 83 fc 01 cmp $0x1,%r12d 5e77: 0f 8f a4 d9 ff ff jg 3821 5e7d: e9 c4 ca ff ff jmpq 2946 5e82: ba 0c 00 00 00 mov $0xc,%edx 5e87: 48 8d 35 1d 53 00 00 lea 0x531d(%rip),%rsi # b1ab <_IO_stdin_used+0x1ab> 5e8e: 4c 
89 ef mov %r13,%rdi 5e91: e8 9a b1 ff ff callq 1030 5e96: 85 c0 test %eax,%eax 5e98: 75 8a jne 5e24 5e9a: 48 8d 35 d2 26 00 00 lea 0x26d2(%rip),%rsi # 8573 5ea1: f3 0f 10 05 df 61 00 movss 0x61df(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 5ea8: 00 5ea9: bf 00 2f 68 59 mov $0x59682f00,%edi 5eae: e8 8d 4b 00 00 callq aa40 5eb3: bf 01 00 00 00 mov $0x1,%edi 5eb8: b8 01 00 00 00 mov $0x1,%eax 5ebd: 48 8d 35 1b 52 00 00 lea 0x521b(%rip),%rsi # b0df <_IO_stdin_used+0xdf> 5ec4: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5ec8: e8 b3 b1 ff ff callq 1080 <__printf_chk@plt> 5ecd: 41 83 fc 01 cmp $0x1,%r12d 5ed1: 0f 8f f7 d8 ff ff jg 37ce 5ed7: e9 6a ca ff ff jmpq 2946 5edc: ba 0c 00 00 00 mov $0xc,%edx 5ee1: 48 8d 35 b6 52 00 00 lea 0x52b6(%rip),%rsi # b19e <_IO_stdin_used+0x19e> 5ee8: 4c 89 ef mov %r13,%rdi 5eeb: e8 40 b1 ff ff callq 1030 5ef0: 85 c0 test %eax,%eax 5ef2: 75 8e jne 5e82 5ef4: 48 8d 35 be 25 00 00 lea 0x25be(%rip),%rsi # 84b9 5efb: f3 0f 10 05 85 61 00 movss 0x6185(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 5f02: 00 5f03: bf 00 2f 68 59 mov $0x59682f00,%edi 5f08: e8 33 4b 00 00 callq aa40 5f0d: bf 01 00 00 00 mov $0x1,%edi 5f12: b8 01 00 00 00 mov $0x1,%eax 5f17: 48 8d 35 92 5a 00 00 lea 0x5a92(%rip),%rsi # b9b0 <_IO_stdin_used+0x9b0> 5f1e: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5f22: e8 59 b1 ff ff callq 1080 <__printf_chk@plt> 5f27: 41 83 fc 01 cmp $0x1,%r12d 5f2b: 0f 8f 4a d8 ff ff jg 377b 5f31: e9 10 ca ff ff jmpq 2946 5f36: ba 09 00 00 00 mov $0x9,%edx 5f3b: 48 8d 35 52 52 00 00 lea 0x5252(%rip),%rsi # b194 <_IO_stdin_used+0x194> 5f42: 4c 89 ef mov %r13,%rdi 5f45: e8 e6 b0 ff ff callq 1030 5f4a: 85 c0 test %eax,%eax 5f4c: 75 8e jne 5edc 5f4e: 48 8d 35 2e 28 00 00 lea 0x282e(%rip),%rsi # 8783 5f55: f3 0f 10 05 2b 61 00 movss 0x612b(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 5f5c: 00 5f5d: bf 00 2f 68 59 mov $0x59682f00,%edi 5f62: e8 d9 4a 00 00 callq aa40 5f67: bf 01 00 00 00 mov $0x1,%edi 5f6c: b8 01 00 00 00 mov $0x1,%eax 5f71: 48 8d 35 51 51 00 00 lea 0x5151(%rip),%rsi 
# b0c9 <_IO_stdin_used+0xc9> 5f78: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5f7c: e8 ff b0 ff ff callq 1080 <__printf_chk@plt> 5f81: 41 83 fc 01 cmp $0x1,%r12d 5f85: 0f 8f 9d d7 ff ff jg 3728 5f8b: e9 b6 c9 ff ff jmpq 2946 5f90: ba 09 00 00 00 mov $0x9,%edx 5f95: 48 8d 35 ee 51 00 00 lea 0x51ee(%rip),%rsi # b18a <_IO_stdin_used+0x18a> 5f9c: 4c 89 ef mov %r13,%rdi 5f9f: e8 8c b0 ff ff callq 1030 5fa4: 85 c0 test %eax,%eax 5fa6: 75 8e jne 5f36 5fa8: 48 8d 35 70 24 00 00 lea 0x2470(%rip),%rsi # 841f 5faf: f3 0f 10 05 d1 60 00 movss 0x60d1(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 5fb6: 00 5fb7: bf 00 2f 68 59 mov $0x59682f00,%edi 5fbc: e8 7f 4a 00 00 callq aa40 5fc1: bf 01 00 00 00 mov $0x1,%edi 5fc6: b8 01 00 00 00 mov $0x1,%eax 5fcb: 48 8d 35 e1 50 00 00 lea 0x50e1(%rip),%rsi # b0b3 <_IO_stdin_used+0xb3> 5fd2: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 5fd6: e8 a5 b0 ff ff callq 1080 <__printf_chk@plt> 5fdb: 41 83 fc 01 cmp $0x1,%r12d 5fdf: 0f 8f f0 d6 ff ff jg 36d5 5fe5: e9 5c c9 ff ff jmpq 2946 5fea: ba 07 00 00 00 mov $0x7,%edx 5fef: 48 8d 35 8b 51 00 00 lea 0x518b(%rip),%rsi # b181 <_IO_stdin_used+0x181> 5ff6: 4c 89 ef mov %r13,%rdi 5ff9: e8 32 b0 ff ff callq 1030 5ffe: 85 c0 test %eax,%eax 6000: 75 8e jne 5f90 6002: f3 0f 10 05 7e 60 00 movss 0x607e(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 6009: 00 600a: 48 8d 35 ef 49 00 00 lea 0x49ef(%rip),%rsi # aa00 6011: bf 00 2f 68 59 mov $0x59682f00,%edi 6016: e8 25 4a 00 00 callq aa40 601b: bf 01 00 00 00 mov $0x1,%edi 6020: b8 01 00 00 00 mov $0x1,%eax 6025: 48 8d 35 69 50 00 00 lea 0x5069(%rip),%rsi # b095 <_IO_stdin_used+0x95> 602c: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 6030: e8 4b b0 ff ff callq 1080 <__printf_chk@plt> 6035: 41 83 fc 01 cmp $0x1,%r12d 6039: 0f 8f 43 d6 ff ff jg 3682 603f: e9 02 c9 ff ff jmpq 2946 6044: ba 07 00 00 00 mov $0x7,%edx 6049: 48 8d 35 29 51 00 00 lea 0x5129(%rip),%rsi # b179 <_IO_stdin_used+0x179> 6050: 4c 89 ef mov %r13,%rdi 6053: e8 d8 af ff ff callq 1030 6058: 85 c0 test %eax,%eax 605a: 75 8e jne 5fea 
605c: f3 0f 10 05 24 60 00 movss 0x6024(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 6063: 00 6064: 48 8d 35 45 49 00 00 lea 0x4945(%rip),%rsi # a9b0 606b: bf 00 2f 68 59 mov $0x59682f00,%edi 6070: e8 cb 49 00 00 callq aa40 6075: bf 01 00 00 00 mov $0x1,%edi 607a: b8 01 00 00 00 mov $0x1,%eax 607f: 48 8d 35 f2 4f 00 00 lea 0x4ff2(%rip),%rsi # b078 <_IO_stdin_used+0x78> 6086: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 608a: e8 f1 af ff ff callq 1080 <__printf_chk@plt> 608f: 41 83 fc 01 cmp $0x1,%r12d 6093: 0f 8f 96 d5 ff ff jg 362f 6099: e9 a8 c8 ff ff jmpq 2946 609e: ba 0b 00 00 00 mov $0xb,%edx 60a3: 48 8d 35 bf 50 00 00 lea 0x50bf(%rip),%rsi # b169 <_IO_stdin_used+0x169> 60aa: 4c 89 ef mov %r13,%rdi 60ad: e8 7e af ff ff callq 1030 60b2: 85 c0 test %eax,%eax 60b4: 75 8e jne 6044 60b6: 48 8d 35 46 30 00 00 lea 0x3046(%rip),%rsi # 9103 60bd: f3 0f 10 05 c3 5f 00 movss 0x5fc3(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 60c4: 00 60c5: bf 05 7a d7 03 mov $0x3d77a05,%edi 60ca: 48 c1 e7 09 shl $0x9,%rdi 60ce: e8 6d 49 00 00 callq aa40 60d3: bf 01 00 00 00 mov $0x1,%edi 60d8: b8 01 00 00 00 mov $0x1,%eax 60dd: 48 8d 35 9c 58 00 00 lea 0x589c(%rip),%rsi # b980 <_IO_stdin_used+0x980> 60e4: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 60e8: e8 93 af ff ff callq 1080 <__printf_chk@plt> 60ed: 41 83 fc 01 cmp $0x1,%r12d 60f1: 0f 8f e5 d4 ff ff jg 35dc 60f7: e9 4a c8 ff ff jmpq 2946 60fc: ba 0b 00 00 00 mov $0xb,%edx 6101: 48 8d 35 54 50 00 00 lea 0x5054(%rip),%rsi # b15c <_IO_stdin_used+0x15c> 6108: 4c 89 ef mov %r13,%rdi 610b: e8 20 af ff ff callq 1030 6110: 85 c0 test %eax,%eax 6112: 75 8a jne 609e 6114: 48 8d 35 07 2f 00 00 lea 0x2f07(%rip),%rsi # 9022 611b: f3 0f 10 05 65 5f 00 movss 0x5f65(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 6122: 00 6123: bf 05 7a d7 03 mov $0x3d77a05,%edi 6128: 48 c1 e7 09 shl $0x9,%rdi 612c: e8 0f 49 00 00 callq aa40 6131: bf 01 00 00 00 mov $0x1,%edi 6136: b8 01 00 00 00 mov $0x1,%eax 613b: 48 8d 35 16 58 00 00 lea 0x5816(%rip),%rsi # b958 <_IO_stdin_used+0x958> 6142: 
f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 6146: e8 35 af ff ff callq 1080 <__printf_chk@plt> 614b: 41 83 fc 01 cmp $0x1,%r12d 614f: 0f 8f 30 d4 ff ff jg 3585 6155: e9 ec c7 ff ff jmpq 2946 615a: ba 0d 00 00 00 mov $0xd,%edx 615f: 48 8d 35 e7 4f 00 00 lea 0x4fe7(%rip),%rsi # b14d <_IO_stdin_used+0x14d> 6166: 4c 89 ef mov %r13,%rdi 6169: e8 c2 ae ff ff callq 1030 616e: 85 c0 test %eax,%eax 6170: 75 8a jne 60fc 6172: 48 8d 35 7e 1f 00 00 lea 0x1f7e(%rip),%rsi # 80f7 6179: f3 0f 10 05 07 5f 00 movss 0x5f07(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 6180: 00 6181: 4c 89 f7 mov %r14,%rdi 6184: e8 b7 48 00 00 callq aa40 6189: bf 01 00 00 00 mov $0x1,%edi 618e: b8 01 00 00 00 mov $0x1,%eax 6193: f3 0f 10 0d ed 5e 00 movss 0x5eed(%rip),%xmm1 # c088 <_IO_stdin_used+0x1088> 619a: 00 619b: 48 8d 35 7e 57 00 00 lea 0x577e(%rip),%rsi # b920 <_IO_stdin_used+0x920> 61a2: f3 0f 5e c8 divss %xmm0,%xmm1 61a6: 66 0f ef c0 pxor %xmm0,%xmm0 61aa: f3 0f 5a c1 cvtss2sd %xmm1,%xmm0 61ae: e8 cd ae ff ff callq 1080 <__printf_chk@plt> 61b3: 41 83 fc 01 cmp $0x1,%r12d 61b7: 0f 8f 71 d3 ff ff jg 352e 61bd: e9 84 c7 ff ff jmpq 2946 61c2: ba 0c 00 00 00 mov $0xc,%edx 61c7: 48 8d 35 72 4f 00 00 lea 0x4f72(%rip),%rsi # b140 <_IO_stdin_used+0x140> 61ce: 4c 89 ef mov %r13,%rdi 61d1: e8 5a ae ff ff callq 1030 61d6: 85 c0 test %eax,%eax 61d8: 75 80 jne 615a 61da: 48 8d 35 44 1e 00 00 lea 0x1e44(%rip),%rsi # 8025 61e1: f3 0f 10 05 9f 5e 00 movss 0x5e9f(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 61e8: 00 61e9: 4c 89 f7 mov %r14,%rdi 61ec: e8 4f 48 00 00 callq aa40 61f1: bf 01 00 00 00 mov $0x1,%edi 61f6: b8 01 00 00 00 mov $0x1,%eax 61fb: f3 0f 10 3d 85 5e 00 movss 0x5e85(%rip),%xmm7 # c088 <_IO_stdin_used+0x1088> 6202: 00 6203: 48 8d 35 e6 56 00 00 lea 0x56e6(%rip),%rsi # b8f0 <_IO_stdin_used+0x8f0> 620a: f3 0f 11 7c 24 0c movss %xmm7,0xc(%rsp) 6210: f3 0f 5e f8 divss %xmm0,%xmm7 6214: 66 0f ef c0 pxor %xmm0,%xmm0 6218: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 621c: e8 5f ae ff ff callq 1080 <__printf_chk@plt> 6221: 41 
83 fc 01 cmp $0x1,%r12d 6225: 0f 8f a4 d2 ff ff jg 34cf 622b: e9 16 c7 ff ff jmpq 2946 6230: ba 0d 00 00 00 mov $0xd,%edx 6235: 48 8d 35 f6 4e 00 00 lea 0x4ef6(%rip),%rsi # b132 <_IO_stdin_used+0x132> 623c: 4c 89 ef mov %r13,%rdi 623f: e8 ec ad ff ff callq 1030 6244: 85 c0 test %eax,%eax 6246: 0f 85 76 ff ff ff jne 61c2 624c: 48 8d 35 76 1f 00 00 lea 0x1f76(%rip),%rsi # 81c9 6253: f3 0f 10 05 2d 5e 00 movss 0x5e2d(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 625a: 00 625b: 4c 89 f7 mov %r14,%rdi 625e: e8 dd 47 00 00 callq aa40 6263: bf 01 00 00 00 mov $0x1,%edi 6268: b8 01 00 00 00 mov $0x1,%eax 626d: f3 0f 10 3d 13 5e 00 movss 0x5e13(%rip),%xmm7 # c088 <_IO_stdin_used+0x1088> 6274: 00 6275: 48 8d 35 44 56 00 00 lea 0x5644(%rip),%rsi # b8c0 <_IO_stdin_used+0x8c0> 627c: f3 0f 11 7c 24 0c movss %xmm7,0xc(%rsp) 6282: f3 0f 5e f8 divss %xmm0,%xmm7 6286: 66 0f ef c0 pxor %xmm0,%xmm0 628a: f3 0f 5a c7 cvtss2sd %xmm7,%xmm0 628e: e8 ed ad ff ff callq 1080 <__printf_chk@plt> 6293: 41 83 fc 01 cmp $0x1,%r12d 6297: 0f 8f d3 d1 ff ff jg 3470 629d: e9 a4 c6 ff ff jmpq 2946 62a2: ba 09 00 00 00 mov $0x9,%edx 62a7: 48 8d 35 a2 4e 00 00 lea 0x4ea2(%rip),%rsi # b150 <_IO_stdin_used+0x150> 62ae: 4c 89 ef mov %r13,%rdi 62b1: e8 7a ad ff ff callq 1030 62b6: 85 c0 test %eax,%eax 62b8: 0f 85 72 ff ff ff jne 6230 62be: 48 8d 35 0a 17 00 00 lea 0x170a(%rip),%rsi # 79cf 62c5: f3 0f 10 05 bb 5d 00 movss 0x5dbb(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 62cc: 00 62cd: bf 00 2f 68 59 mov $0x59682f00,%edi 62d2: e8 69 47 00 00 callq aa40 62d7: bf 01 00 00 00 mov $0x1,%edi 62dc: b8 01 00 00 00 mov $0x1,%eax 62e1: 48 8d 35 a8 55 00 00 lea 0x55a8(%rip),%rsi # b890 <_IO_stdin_used+0x890> 62e8: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 62ec: e8 8f ad ff ff callq 1080 <__printf_chk@plt> 62f1: 41 83 fc 01 cmp $0x1,%r12d 62f5: 0f 8f 0e d1 ff ff jg 3409 62fb: e9 46 c6 ff ff jmpq 2946 6300: ba 09 00 00 00 mov $0x9,%edx 6305: 48 8d 35 37 4e 00 00 lea 0x4e37(%rip),%rsi # b143 <_IO_stdin_used+0x143> 630c: 4c 89 ef mov 
%r13,%rdi 630f: e8 1c ad ff ff callq 1030 6314: 85 c0 test %eax,%eax 6316: 75 8a jne 62a2 6318: 48 8d 35 f2 15 00 00 lea 0x15f2(%rip),%rsi # 7911 631f: f3 0f 10 05 61 5d 00 movss 0x5d61(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 6326: 00 6327: bf 00 2f 68 59 mov $0x59682f00,%edi 632c: e8 0f 47 00 00 callq aa40 6331: bf 01 00 00 00 mov $0x1,%edi 6336: b8 01 00 00 00 mov $0x1,%eax 633b: 48 8d 35 26 55 00 00 lea 0x5526(%rip),%rsi # b868 <_IO_stdin_used+0x868> 6342: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 6346: e8 35 ad ff ff callq 1080 <__printf_chk@plt> 634b: 41 83 fc 01 cmp $0x1,%r12d 634f: 0f 8f 61 d0 ff ff jg 33b6 6355: e9 ec c5 ff ff jmpq 2946 635a: ba 0c 00 00 00 mov $0xc,%edx 635f: 48 8d 35 bf 4d 00 00 lea 0x4dbf(%rip),%rsi # b125 <_IO_stdin_used+0x125> 6366: 4c 89 ef mov %r13,%rdi 6369: e8 c2 ac ff ff callq 1030 636e: 85 c0 test %eax,%eax 6370: 75 8e jne 6300 6372: 48 8d 35 37 1b 00 00 lea 0x1b37(%rip),%rsi # 7eb0 6379: f3 0f 10 05 07 5d 00 movss 0x5d07(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 6380: 00 6381: 4c 89 f7 mov %r14,%rdi 6384: e8 b7 46 00 00 callq aa40 6389: bf 01 00 00 00 mov $0x1,%edi 638e: b8 01 00 00 00 mov $0x1,%eax 6393: f3 0f 10 0d ed 5c 00 movss 0x5ced(%rip),%xmm1 # c088 <_IO_stdin_used+0x1088> 639a: 00 639b: 48 8d 35 9e 54 00 00 lea 0x549e(%rip),%rsi # b840 <_IO_stdin_used+0x840> 63a2: f3 0f 5e c8 divss %xmm0,%xmm1 63a6: 66 0f ef c0 pxor %xmm0,%xmm0 63aa: f3 0f 5a c1 cvtss2sd %xmm1,%xmm0 63ae: e8 cd ac ff ff callq 1080 <__printf_chk@plt> 63b3: 41 83 fc 01 cmp $0x1,%r12d 63b7: 0f 8f a6 cf ff ff jg 3363 63bd: e9 84 c5 ff ff jmpq 2946 63c2: ba 09 00 00 00 mov $0x9,%edx 63c7: 48 8d 35 5a 4d 00 00 lea 0x4d5a(%rip),%rsi # b128 <_IO_stdin_used+0x128> 63ce: 4c 89 ef mov %r13,%rdi 63d1: e8 5a ac ff ff callq 1030 63d6: 85 c0 test %eax,%eax 63d8: 75 80 jne 635a 63da: 48 8d 35 ac 16 00 00 lea 0x16ac(%rip),%rsi # 7a8d 63e1: f3 0f 10 05 9f 5c 00 movss 0x5c9f(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 63e8: 00 63e9: bf 00 2f 68 59 mov $0x59682f00,%edi 63ee: e8 
4d 46 00 00 callq aa40 63f3: bf 01 00 00 00 mov $0x1,%edi 63f8: b8 01 00 00 00 mov $0x1,%eax 63fd: 48 8d 35 1c 54 00 00 lea 0x541c(%rip),%rsi # b820 <_IO_stdin_used+0x820> 6404: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 6408: e8 73 ac ff ff callq 1080 <__printf_chk@plt> 640d: 41 83 fc 01 cmp $0x1,%r12d 6411: 0f 8f eb ce ff ff jg 3302 6417: e9 2a c5 ff ff jmpq 2946 641c: ba 0d 00 00 00 mov $0xd,%edx 6421: 48 8d 35 ef 4c 00 00 lea 0x4cef(%rip),%rsi # b117 <_IO_stdin_used+0x117> 6428: 4c 89 ef mov %r13,%rdi 642b: e8 00 ac ff ff callq 1030 6430: 85 c0 test %eax,%eax 6432: 75 8e jne 63c2 6434: 48 8d 35 8e 30 00 00 lea 0x308e(%rip),%rsi # 94c9 643b: f3 0f 10 05 45 5c 00 movss 0x5c45(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 6442: 00 6443: bf 05 7a d7 03 mov $0x3d77a05,%edi 6448: 48 c1 e7 09 shl $0x9,%rdi 644c: e8 ef 45 00 00 callq aa40 6451: bf 01 00 00 00 mov $0x1,%edi 6456: b8 01 00 00 00 mov $0x1,%eax 645b: 48 8d 35 8e 53 00 00 lea 0x538e(%rip),%rsi # b7f0 <_IO_stdin_used+0x7f0> 6462: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 6466: e8 15 ac ff ff callq 1080 <__printf_chk@plt> 646b: 41 83 fc 01 cmp $0x1,%r12d 646f: 0f 8f 3a ce ff ff jg 32af 6475: e9 cc c4 ff ff jmpq 2946 647a: 4c 89 f0 mov %r14,%rax 647d: 66 0f ef c9 pxor %xmm1,%xmm1 6481: 48 d1 e8 shr %rax 6484: f3 48 0f 2a c8 cvtsi2ss %rax,%xmm1 6489: f3 0f 58 c9 addss %xmm1,%xmm1 648d: e9 24 f5 ff ff jmpq 59b6 6492: 48 89 c8 mov %rcx,%rax 6495: 83 e1 01 and $0x1,%ecx 6498: 66 0f ef c0 pxor %xmm0,%xmm0 649c: 48 d1 e8 shr %rax 649f: 48 09 c8 or %rcx,%rax 64a2: f3 48 0f 2a c0 cvtsi2ss %rax,%xmm0 64a7: f3 0f 58 c0 addss %xmm0,%xmm0 64ab: e9 e8 f4 ff ff jmpq 5998 64b0: 41 bd 01 00 00 00 mov $0x1,%r13d 64b6: e9 80 f4 ff ff jmpq 593b 64bb: 48 8d 35 b6 08 00 00 lea 0x8b6(%rip),%rsi # 6d78 64c2: f3 0f 10 05 be 5b 00 movss 0x5bbe(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> 64c9: 00 64ca: 4c 89 f7 mov %r14,%rdi 64cd: e8 6e 45 00 00 callq aa40 64d2: bf 01 00 00 00 mov $0x1,%edi 64d7: b8 01 00 00 00 mov $0x1,%eax 64dc: 48 8d 35 69 52 00 00 lea 
0x5269(%rip),%rsi # b74c <_IO_stdin_used+0x74c> 64e3: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 64e7: e8 94 ab ff ff callq 1080 <__printf_chk@plt> 64ec: 41 83 fc 01 cmp $0x1,%r12d 64f0: 0f 8e 50 c4 ff ff jle 2946 64f6: f3 0f 10 3d 8a 5b 00 movss 0x5b8a(%rip),%xmm7 # c088 <_IO_stdin_used+0x1088> 64fd: 00 64fe: f3 0f 11 7c 24 0c movss %xmm7,0xc(%rsp) 6504: f3 0f 11 7c 24 08 movss %xmm7,0x8(%rsp) 650a: e9 18 ad ff ff jmpq 1227 650f: f3 0f 10 2d 71 5b 00 movss 0x5b71(%rip),%xmm5 # c088 <_IO_stdin_used+0x1088> 6516: 00 6517: f3 0f 11 6c 24 0c movss %xmm5,0xc(%rsp) 651d: f3 0f 11 6c 24 08 movss %xmm5,0x8(%rsp) 6523: e9 b4 ac ff ff jmpq 11dc 6528: e8 13 ab ff ff callq 1040 <__stack_chk_fail@plt> 652d: 0f 1f 00 nopl (%rax) 0000000000006530 : 6530: 53 push %rbx 6531: 41 89 d1 mov %edx,%r9d 6534: f7 c7 00 00 00 08 test $0x8000000,%edi 653a: 74 13 je 654f 653c: 31 c9 xor %ecx,%ecx 653e: 0f 01 d0 xgetbv 6541: 89 c2 mov %eax,%edx 6543: 83 e2 06 and $0x6,%edx 6546: 83 fa 06 cmp $0x6,%edx 6549: 0f 84 4a 02 00 00 je 6799 654f: 45 31 db xor %r11d,%r11d 6552: 45 31 d2 xor %r10d,%r10d 6555: 41 89 f0 mov %esi,%r8d 6558: 41 c1 e8 0f shr $0xf,%r8d 655c: 41 83 e0 01 and $0x1,%r8d 6560: 44 89 c0 mov %r8d,%eax 6563: 83 c8 02 or $0x2,%eax 6566: f7 c6 00 00 80 00 test $0x800000,%esi 656c: 44 0f 45 c0 cmovne %eax,%r8d 6570: 44 89 c0 mov %r8d,%eax 6573: 83 c8 08 or $0x8,%eax 6576: f7 c6 00 00 00 02 test $0x2000000,%esi 657c: 44 0f 45 c0 cmovne %eax,%r8d 6580: 44 89 c0 mov %r8d,%eax 6583: 83 c8 10 or $0x10,%eax 6586: 81 e6 00 00 00 04 and $0x4000000,%esi 658c: 44 0f 45 c0 cmovne %eax,%r8d 6590: 44 89 c0 mov %r8d,%eax 6593: 83 c8 04 or $0x4,%eax 6596: f7 c7 00 00 80 00 test $0x800000,%edi 659c: 44 0f 45 c0 cmovne %eax,%r8d 65a0: 44 89 c0 mov %r8d,%eax 65a3: 0d 00 00 04 00 or $0x40000,%eax 65a8: f7 c7 00 00 00 02 test $0x2000000,%edi 65ae: 44 0f 45 c0 cmovne %eax,%r8d 65b2: 44 89 c0 mov %r8d,%eax 65b5: 0d 00 00 08 00 or $0x80000,%eax 65ba: 40 f6 c7 02 test $0x2,%dil 65be: 44 0f 45 c0 cmovne %eax,%r8d 
65c2: 44 89 c0 mov %r8d,%eax 65c5: 83 c8 20 or $0x20,%eax 65c8: 40 f6 c7 01 test $0x1,%dil 65cc: 44 0f 45 c0 cmovne %eax,%r8d 65d0: 44 89 c0 mov %r8d,%eax 65d3: 83 c8 40 or $0x40,%eax 65d6: f7 c7 00 02 00 00 test $0x200,%edi 65dc: 44 0f 45 c0 cmovne %eax,%r8d 65e0: 44 89 c0 mov %r8d,%eax 65e3: 0c 80 or $0x80,%al 65e5: f7 c7 00 00 08 00 test $0x80000,%edi 65eb: 44 0f 45 c0 cmovne %eax,%r8d 65ef: 44 89 c0 mov %r8d,%eax 65f2: 80 cc 01 or $0x1,%ah 65f5: f7 c7 00 00 10 00 test $0x100000,%edi 65fb: 44 0f 45 c0 cmovne %eax,%r8d 65ff: 45 85 d2 test %r10d,%r10d 6602: 74 1b je 661f 6604: f7 c7 00 00 00 10 test $0x10000000,%edi 660a: 74 07 je 6613 660c: 41 81 c8 00 02 00 00 or $0x200,%r8d 6613: 81 e7 00 10 00 00 and $0x1000,%edi 6619: 0f 85 6e 01 00 00 jne 678d 661f: 31 f6 xor %esi,%esi 6621: 41 83 f9 06 cmp $0x6,%r9d 6625: 7e 50 jle 6677 6627: b8 07 00 00 00 mov $0x7,%eax 662c: 89 f1 mov %esi,%ecx 662e: 0f a2 cpuid 6630: f6 c3 08 test $0x8,%bl 6633: 74 07 je 663c 6635: 41 81 c8 00 00 01 00 or $0x10000,%r8d 663c: 31 f6 xor %esi,%esi 663e: 45 85 d2 test %r10d,%r10d 6641: 74 1b je 665e 6643: f6 c3 20 test $0x20,%bl 6646: 74 07 je 664f 6648: 41 81 c8 00 04 00 00 or $0x400,%r8d 664f: 89 ce mov %ecx,%esi 6651: 81 e6 00 04 00 00 and $0x400,%esi 6657: 74 05 je 665e 6659: be 02 00 00 00 mov $0x2,%esi 665e: f6 c7 01 test $0x1,%bh 6661: 74 07 je 666a 6663: 41 81 c8 00 00 02 00 or $0x20000,%r8d 666a: f6 c5 01 test $0x1,%ch 666d: 74 03 je 6672 666f: 83 ce 01 or $0x1,%esi 6672: 45 85 db test %r11d,%r11d 6675: 75 50 jne 66c7 6677: b8 00 00 00 80 mov $0x80000000,%eax 667c: 0f a2 cpuid 667e: 3d 00 00 00 80 cmp $0x80000000,%eax 6683: 76 33 jbe 66b8 6685: b8 01 00 00 80 mov $0x80000001,%eax 668a: 0f a2 cpuid 668c: f6 c1 40 test $0x40,%cl 668f: 74 07 je 6698 6691: 41 81 c8 00 08 00 00 or $0x800,%r8d 6698: 45 85 d2 test %r10d,%r10d 669b: 74 1b je 66b8 669d: f7 c1 00 00 01 00 test $0x10000,%ecx 66a3: 74 07 je 66ac 66a5: 41 81 c8 00 10 00 00 or $0x1000,%r8d 66ac: 80 e5 08 and $0x8,%ch 66af: 74 07 
je 66b8 66b1: 41 81 c8 00 20 00 00 or $0x2000,%r8d 66b8: 44 89 05 3d 7a 00 00 mov %r8d,0x7a3d(%rip) # e0fc <__cpu_model+0xc> 66bf: 5b pop %rbx 66c0: 89 35 42 7a 00 00 mov %esi,0x7a42(%rip) # e108 <__cpu_features2> 66c6: c3 retq 66c7: f7 c3 00 00 01 00 test $0x10000,%ebx 66cd: 74 07 je 66d6 66cf: 41 81 c8 00 80 00 00 or $0x8000,%r8d 66d6: 85 db test %ebx,%ebx 66d8: 0f 88 d7 00 00 00 js 67b5 66de: f7 c3 00 00 00 40 test $0x40000000,%ebx 66e4: 74 07 je 66ed 66e6: 41 81 c8 00 00 20 00 or $0x200000,%r8d 66ed: f7 c3 00 00 02 00 test $0x20000,%ebx 66f3: 74 07 je 66fc 66f5: 41 81 c8 00 00 40 00 or $0x400000,%r8d 66fc: f7 c3 00 00 00 10 test $0x10000000,%ebx 6702: 74 07 je 670b 6704: 41 81 c8 00 00 80 00 or $0x800000,%r8d 670b: f7 c3 00 00 00 04 test $0x4000000,%ebx 6711: 74 07 je 671a 6713: 41 81 c8 00 00 00 02 or $0x2000000,%r8d 671a: f7 c3 00 00 00 08 test $0x8000000,%ebx 6720: 74 07 je 6729 6722: 41 81 c8 00 00 00 01 or $0x1000000,%r8d 6729: 81 e3 00 00 20 00 and $0x200000,%ebx 672f: 74 07 je 6738 6731: 41 81 c8 00 00 00 08 or $0x8000000,%r8d 6738: f6 c1 02 test $0x2,%cl 673b: 74 07 je 6744 673d: 41 81 c8 00 00 00 04 or $0x4000000,%r8d 6744: f6 c1 40 test $0x40,%cl 6747: 74 07 je 6750 6749: 41 81 c8 00 00 00 80 or $0x80000000,%r8d 6750: f6 c5 08 test $0x8,%ch 6753: 74 03 je 6758 6755: 83 ce 04 or $0x4,%esi 6758: f6 c5 10 test $0x10,%ch 675b: 74 03 je 6760 675d: 83 ce 08 or $0x8,%esi 6760: 80 e5 40 and $0x40,%ch 6763: 74 07 je 676c 6765: 41 81 c8 00 00 00 40 or $0x40000000,%r8d 676c: f6 c2 04 test $0x4,%dl 676f: 74 07 je 6778 6771: 41 81 c8 00 00 00 10 or $0x10000000,%r8d 6778: 80 e2 08 and $0x8,%dl 677b: 0f 84 f6 fe ff ff je 6677 6781: 41 81 c8 00 00 00 20 or $0x20000000,%r8d 6788: e9 ea fe ff ff jmpq 6677 678d: 41 81 c8 00 40 00 00 or $0x4000,%r8d 6794: e9 86 fe ff ff jmpq 661f 6799: 25 e6 00 00 00 and $0xe6,%eax 679e: 45 31 db xor %r11d,%r11d 67a1: 41 ba 01 00 00 00 mov $0x1,%r10d 67a7: 3d e6 00 00 00 cmp $0xe6,%eax 67ac: 41 0f 94 c3 sete %r11b 67b0: e9 a0 fd ff ff 
jmpq 6555 67b5: 41 81 c8 00 00 10 00 or $0x100000,%r8d 67bc: e9 1d ff ff ff jmpq 66de 67c1: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1) 67c8: 00 00 00 00 67cc: 0f 1f 40 00 nopl 0x0(%rax) 00000000000067d0 <__cpu_indicator_init>: 67d0: f3 0f 1e fa endbr64 67d4: 8b 05 16 79 00 00 mov 0x7916(%rip),%eax # e0f0 <__cpu_model> 67da: 45 31 c9 xor %r9d,%r9d 67dd: 85 c0 test %eax,%eax 67df: 75 78 jne 6859 <__cpu_indicator_init+0x89> 67e1: 53 push %rbx 67e2: 44 89 c8 mov %r9d,%eax 67e5: 0f a2 cpuid 67e7: 85 c0 test %eax,%eax 67e9: 0f 84 cc 00 00 00 je 68bb <__cpu_indicator_init+0xeb> 67ef: 44 89 c8 mov %r9d,%eax 67f2: 0f a2 cpuid 67f4: 41 89 da mov %ebx,%r10d 67f7: 41 89 c0 mov %eax,%r8d 67fa: 85 c0 test %eax,%eax 67fc: 0f 8e b9 00 00 00 jle 68bb <__cpu_indicator_init+0xeb> 6802: 44 89 c8 mov %r9d,%eax 6805: 0f a2 cpuid 6807: 85 c0 test %eax,%eax 6809: 0f 84 ac 00 00 00 je 68bb <__cpu_indicator_init+0xeb> 680f: b8 01 00 00 00 mov $0x1,%eax 6814: 0f a2 cpuid 6816: 89 d6 mov %edx,%esi 6818: 89 cf mov %ecx,%edi 681a: 89 c2 mov %eax,%edx 681c: 89 c1 mov %eax,%ecx 681e: c1 ea 04 shr $0x4,%edx 6821: 41 89 c3 mov %eax,%r11d 6824: c1 e9 08 shr $0x8,%ecx 6827: 41 c1 eb 0c shr $0xc,%r11d 682b: 83 e2 0f and $0xf,%edx 682e: 83 e1 0f and $0xf,%ecx 6831: 41 81 e3 f0 00 00 00 and $0xf0,%r11d 6838: 41 81 fa 47 65 6e 75 cmp $0x756e6547,%r10d 683f: 74 1c je 685d <__cpu_indicator_init+0x8d> 6841: 41 81 fa 41 75 74 68 cmp $0x68747541,%r10d 6848: 74 33 je 687d <__cpu_indicator_init+0xad> 684a: c7 05 9c 78 00 00 03 movl $0x3,0x789c(%rip) # e0f0 <__cpu_model> 6851: 00 00 00 6854: 44 89 c8 mov %r9d,%eax 6857: 5b pop %rbx 6858: c3 retq 6859: 44 89 c8 mov %r9d,%eax 685c: c3 retq 685d: 83 f9 06 cmp $0x6,%ecx 6860: 0f 84 fe 00 00 00 je 6964 <__cpu_indicator_init+0x194> 6866: 44 89 c2 mov %r8d,%edx 6869: e8 c2 fc ff ff callq 6530 686e: 45 31 c9 xor %r9d,%r9d 6871: c7 05 75 78 00 00 01 movl $0x1,0x7875(%rip) # e0f0 <__cpu_model> 6878: 00 00 00 687b: eb d7 jmp 6854 <__cpu_indicator_init+0x84> 
687d: 83 f9 0f cmp $0xf,%ecx 6880: 74 17 je 6899 <__cpu_indicator_init+0xc9> 6882: 44 89 c2 mov %r8d,%edx 6885: e8 a6 fc ff ff callq 6530 688a: 45 31 c9 xor %r9d,%r9d 688d: c7 05 59 78 00 00 02 movl $0x2,0x7859(%rip) # e0f0 <__cpu_model> 6894: 00 00 00 6897: eb bb jmp 6854 <__cpu_indicator_init+0x84> 6899: c1 e8 14 shr $0x14,%eax 689c: 44 09 da or %r11d,%edx 689f: 0f b6 c0 movzbl %al,%eax 68a2: 83 e8 01 sub $0x1,%eax 68a5: 83 f8 07 cmp $0x7,%eax 68a8: 77 d8 ja 6882 <__cpu_indicator_init+0xb2> 68aa: 48 8d 0d 0f 58 00 00 lea 0x580f(%rip),%rcx # c0c0 <_IO_stdin_used+0x10c0> 68b1: 48 63 04 81 movslq (%rcx,%rax,4),%rax 68b5: 48 01 c8 add %rcx,%rax 68b8: 3e ff e0 notrack jmpq *%rax 68bb: c7 05 2b 78 00 00 03 movl $0x3,0x782b(%rip) # e0f0 <__cpu_model> 68c2: 00 00 00 68c5: 41 83 c9 ff or $0xffffffff,%r9d 68c9: eb 89 jmp 6854 <__cpu_indicator_init+0x84> 68cb: c7 05 1f 78 00 00 0a movl $0xa,0x781f(%rip) # e0f4 <__cpu_model+0x4> 68d2: 00 00 00 68d5: 83 fa 1f cmp $0x1f,%edx 68d8: 0f 87 05 02 00 00 ja 6ae3 <__cpu_indicator_init+0x313> 68de: c7 05 10 78 00 00 0b movl $0xb,0x7810(%rip) # e0f8 <__cpu_model+0x8> 68e5: 00 00 00 68e8: eb 98 jmp 6882 <__cpu_indicator_init+0xb2> 68ea: c7 05 00 78 00 00 09 movl $0x9,0x7800(%rip) # e0f4 <__cpu_model+0x4> 68f1: 00 00 00 68f4: eb 8c jmp 6882 <__cpu_indicator_init+0xb2> 68f6: c7 05 f4 77 00 00 05 movl $0x5,0x77f4(%rip) # e0f4 <__cpu_model+0x4> 68fd: 00 00 00 6900: 83 fa 02 cmp $0x2,%edx 6903: 0f 84 cb 01 00 00 je 6ad4 <__cpu_indicator_init+0x304> 6909: 83 fa 0f cmp $0xf,%edx 690c: 0f 87 e9 01 00 00 ja 6afb <__cpu_indicator_init+0x32b> 6912: c7 05 dc 77 00 00 07 movl $0x7,0x77dc(%rip) # e0f8 <__cpu_model+0x8> 6919: 00 00 00 691c: e9 61 ff ff ff jmpq 6882 <__cpu_indicator_init+0xb2> 6921: c7 05 c9 77 00 00 08 movl $0x8,0x77c9(%rip) # e0f4 <__cpu_model+0x4> 6928: 00 00 00 692b: e9 52 ff ff ff jmpq 6882 <__cpu_indicator_init+0xb2> 6930: c7 05 ba 77 00 00 04 movl $0x4,0x77ba(%rip) # e0f4 <__cpu_model+0x4> 6937: 00 00 00 693a: 83 fa 04 cmp 
$0x4,%edx 693d: 0f 84 e0 01 00 00 je 6b23 <__cpu_indicator_init+0x353> 6943: 83 fa 08 cmp $0x8,%edx 6946: 0f 84 c8 01 00 00 je 6b14 <__cpu_indicator_init+0x344> 694c: 83 fa 02 cmp $0x2,%edx 694f: 0f 85 2d ff ff ff jne 6882 <__cpu_indicator_init+0xb2> 6955: c7 05 99 77 00 00 04 movl $0x4,0x7799(%rip) # e0f8 <__cpu_model+0x8> 695c: 00 00 00 695f: e9 1e ff ff ff jmpq 6882 <__cpu_indicator_init+0xb2> 6964: 89 d0 mov %edx,%eax 6966: 44 09 d8 or %r11d,%eax 6969: 84 db test %bl,%bl 696b: 0f 85 f5 fe ff ff jne 6866 <__cpu_indicator_init+0x96> 6971: 83 e8 0f sub $0xf,%eax 6974: 3d 8f 00 00 00 cmp $0x8f,%eax 6979: 0f 87 e7 fe ff ff ja 6866 <__cpu_indicator_init+0x96> 697f: 48 8d 15 5a 57 00 00 lea 0x575a(%rip),%rdx # c0e0 <_IO_stdin_used+0x10e0> 6986: 48 63 04 82 movslq (%rdx,%rax,4),%rax 698a: 48 01 d0 add %rdx,%rax 698d: 3e ff e0 notrack jmpq *%rax 6990: c7 05 5a 77 00 00 0b movl $0xb,0x775a(%rip) # e0f4 <__cpu_model+0x4> 6997: 00 00 00 699a: e9 c7 fe ff ff jmpq 6866 <__cpu_indicator_init+0x96> 699f: c7 05 4b 77 00 00 0d movl $0xd,0x774b(%rip) # e0f4 <__cpu_model+0x4> 69a6: 00 00 00 69a9: e9 b8 fe ff ff jmpq 6866 <__cpu_indicator_init+0x96> 69ae: 48 b8 03 00 00 00 11 movabs $0x1100000003,%rax 69b5: 00 00 00 69b8: 48 89 05 35 77 00 00 mov %rax,0x7735(%rip) # e0f4 <__cpu_model+0x4> 69bf: e9 a2 fe ff ff jmpq 6866 <__cpu_indicator_init+0x96> 69c4: c7 05 26 77 00 00 0c movl $0xc,0x7726(%rip) # e0f4 <__cpu_model+0x4> 69cb: 00 00 00 69ce: e9 93 fe ff ff jmpq 6866 <__cpu_indicator_init+0x96> 69d3: c7 05 17 77 00 00 07 movl $0x7,0x7717(%rip) # e0f4 <__cpu_model+0x4> 69da: 00 00 00 69dd: e9 84 fe ff ff jmpq 6866 <__cpu_indicator_init+0x96> 69e2: c7 05 08 77 00 00 03 movl $0x3,0x7708(%rip) # e0f4 <__cpu_model+0x4> 69e9: 00 00 00 69ec: b8 07 00 00 00 mov $0x7,%eax 69f1: 31 c9 xor %ecx,%ecx 69f3: 0f a2 cpuid 69f5: 80 e5 08 and $0x8,%ch 69f8: 0f 84 4c 01 00 00 je 6b4a <__cpu_indicator_init+0x37a> 69fe: c7 05 f0 76 00 00 15 movl $0x15,0x76f0(%rip) # e0f8 <__cpu_model+0x8> 6a05: 00 00 00 
6a08: e9 59 fe ff ff jmpq 6866 <__cpu_indicator_init+0x96> 6a0d: 48 b8 03 00 00 00 0f movabs $0xf00000003,%rax 6a14: 00 00 00 6a17: 48 89 05 d6 76 00 00 mov %rax,0x76d6(%rip) # e0f4 <__cpu_model+0x4> 6a1e: e9 43 fe ff ff jmpq 6866 <__cpu_indicator_init+0x96> 6a23: 48 b8 03 00 00 00 0e movabs $0xe00000003,%rax 6a2a: 00 00 00 6a2d: 48 89 05 c0 76 00 00 mov %rax,0x76c0(%rip) # e0f4 <__cpu_model+0x4> 6a34: e9 2d fe ff ff jmpq 6866 <__cpu_indicator_init+0x96> 6a39: 48 b8 03 00 00 00 0d movabs $0xd00000003,%rax 6a40: 00 00 00 6a43: 48 89 05 aa 76 00 00 mov %rax,0x76aa(%rip) # e0f4 <__cpu_model+0x4> 6a4a: e9 17 fe ff ff jmpq 6866 <__cpu_indicator_init+0x96> 6a4f: 48 b8 03 00 00 00 0c movabs $0xc00000003,%rax 6a56: 00 00 00 6a59: 48 89 05 94 76 00 00 mov %rax,0x7694(%rip) # e0f4 <__cpu_model+0x4> 6a60: e9 01 fe ff ff jmpq 6866 <__cpu_indicator_init+0x96> 6a65: c7 05 85 76 00 00 06 movl $0x6,0x7685(%rip) # e0f4 <__cpu_model+0x4> 6a6c: 00 00 00 6a6f: e9 f2 fd ff ff jmpq 6866 <__cpu_indicator_init+0x96> 6a74: 48 b8 03 00 00 00 03 movabs $0x300000003,%rax 6a7b: 00 00 00 6a7e: 48 89 05 6f 76 00 00 mov %rax,0x766f(%rip) # e0f4 <__cpu_model+0x4> 6a85: e9 dc fd ff ff jmpq 6866 <__cpu_indicator_init+0x96> 6a8a: 48 b8 03 00 00 00 02 movabs $0x200000003,%rax 6a91: 00 00 00 6a94: 48 89 05 59 76 00 00 mov %rax,0x7659(%rip) # e0f4 <__cpu_model+0x4> 6a9b: e9 c6 fd ff ff jmpq 6866 <__cpu_indicator_init+0x96> 6aa0: c7 05 4a 76 00 00 01 movl $0x1,0x764a(%rip) # e0f4 <__cpu_model+0x4> 6aa7: 00 00 00 6aaa: e9 b7 fd ff ff jmpq 6866 <__cpu_indicator_init+0x96> 6aaf: 48 b8 03 00 00 00 01 movabs $0x100000003,%rax 6ab6: 00 00 00 6ab9: 48 89 05 34 76 00 00 mov %rax,0x7634(%rip) # e0f4 <__cpu_model+0x4> 6ac0: e9 a1 fd ff ff jmpq 6866 <__cpu_indicator_init+0x96> 6ac5: c7 05 25 76 00 00 02 movl $0x2,0x7625(%rip) # e0f4 <__cpu_model+0x4> 6acc: 00 00 00 6acf: e9 92 fd ff ff jmpq 6866 <__cpu_indicator_init+0x96> 6ad4: c7 05 1a 76 00 00 08 movl $0x8,0x761a(%rip) # e0f8 <__cpu_model+0x8> 6adb: 00 00 00 
6ade: e9 9f fd ff ff jmpq 6882 <__cpu_indicator_init+0xb2> 6ae3: 83 fa 2f cmp $0x2f,%edx 6ae6: 0f 86 96 fd ff ff jbe 6882 <__cpu_indicator_init+0xb2> 6aec: c7 05 02 76 00 00 14 movl $0x14,0x7602(%rip) # e0f8 <__cpu_model+0x8> 6af3: 00 00 00 6af6: e9 87 fd ff ff jmpq 6882 <__cpu_indicator_init+0xb2> 6afb: 83 fa 2f cmp $0x2f,%edx 6afe: 76 d4 jbe 6ad4 <__cpu_indicator_init+0x304> 6b00: 83 fa 4f cmp $0x4f,%edx 6b03: 77 2d ja 6b32 <__cpu_indicator_init+0x362> 6b05: c7 05 e9 75 00 00 09 movl $0x9,0x75e9(%rip) # e0f8 <__cpu_model+0x8> 6b0c: 00 00 00 6b0f: e9 6e fd ff ff jmpq 6882 <__cpu_indicator_init+0xb2> 6b14: c7 05 da 75 00 00 06 movl $0x6,0x75da(%rip) # e0f8 <__cpu_model+0x8> 6b1b: 00 00 00 6b1e: e9 5f fd ff ff jmpq 6882 <__cpu_indicator_init+0xb2> 6b23: c7 05 cb 75 00 00 05 movl $0x5,0x75cb(%rip) # e0f8 <__cpu_model+0x8> 6b2a: 00 00 00 6b2d: e9 50 fd ff ff jmpq 6882 <__cpu_indicator_init+0xb2> 6b32: 83 fa 7f cmp $0x7f,%edx 6b35: 0f 87 47 fd ff ff ja 6882 <__cpu_indicator_init+0xb2> 6b3b: c7 05 b3 75 00 00 0a movl $0xa,0x75b3(%rip) # e0f8 <__cpu_model+0x8> 6b42: 00 00 00 6b45: e9 38 fd ff ff jmpq 6882 <__cpu_indicator_init+0xb2> 6b4a: c7 05 a4 75 00 00 10 movl $0x10,0x75a4(%rip) # e0f8 <__cpu_model+0x8> 6b51: 00 00 00 6b54: e9 0d fd ff ff jmpq 6866 <__cpu_indicator_init+0x96> 6b59: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 0000000000006b60 <_start>: 6b60: f3 0f 1e fa endbr64 6b64: 31 ed xor %ebp,%ebp 6b66: 49 89 d1 mov %rdx,%r9 6b69: 5e pop %rsi 6b6a: 48 89 e2 mov %rsp,%rdx 6b6d: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp 6b71: 50 push %rax 6b72: 54 push %rsp 6b73: 4c 8d 05 66 40 00 00 lea 0x4066(%rip),%r8 # abe0 <__libc_csu_fini> 6b7a: 48 8d 0d ef 3f 00 00 lea 0x3fef(%rip),%rcx # ab70 <__libc_csu_init> 6b81: 48 8d 3d 38 a5 ff ff lea -0x5ac8(%rip),%rdi # 10c0
6b88: ff 15 52 74 00 00 callq *0x7452(%rip) # dfe0 <__libc_start_main@GLIBC_2.2.5> 6b8e: f4 hlt 6b8f: 90 nop 0000000000006b90 : 6b90: 48 8d 3d 49 75 00 00 lea 0x7549(%rip),%rdi # e0e0 6b97: 48 8d 05 42 75 00 00 lea 0x7542(%rip),%rax # e0e0 6b9e: 48 39 f8 cmp %rdi,%rax 6ba1: 74 15 je 6bb8 6ba3: 48 8b 05 2e 74 00 00 mov 0x742e(%rip),%rax # dfd8 <_ITM_deregisterTMCloneTable> 6baa: 48 85 c0 test %rax,%rax 6bad: 74 09 je 6bb8 6baf: ff e0 jmpq *%rax 6bb1: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 6bb8: c3 retq 6bb9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 0000000000006bc0 : 6bc0: 48 8d 3d 19 75 00 00 lea 0x7519(%rip),%rdi # e0e0 6bc7: 48 8d 35 12 75 00 00 lea 0x7512(%rip),%rsi # e0e0 6bce: 48 29 fe sub %rdi,%rsi 6bd1: 48 89 f0 mov %rsi,%rax 6bd4: 48 c1 ee 3f shr $0x3f,%rsi 6bd8: 48 c1 f8 03 sar $0x3,%rax 6bdc: 48 01 c6 add %rax,%rsi 6bdf: 48 d1 fe sar %rsi 6be2: 74 14 je 6bf8 6be4: 48 8b 05 05 74 00 00 mov 0x7405(%rip),%rax # dff0 <_ITM_registerTMCloneTable> 6beb: 48 85 c0 test %rax,%rax 6bee: 74 08 je 6bf8 6bf0: ff e0 jmpq *%rax 6bf2: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) 6bf8: c3 retq 6bf9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 0000000000006c00 <__do_global_dtors_aux>: 6c00: f3 0f 1e fa endbr64 6c04: 80 3d dd 74 00 00 00 cmpb $0x0,0x74dd(%rip) # e0e8 6c0b: 75 2b jne 6c38 <__do_global_dtors_aux+0x38> 6c0d: 55 push %rbp 6c0e: 48 83 3d e2 73 00 00 cmpq $0x0,0x73e2(%rip) # dff8 <__cxa_finalize@GLIBC_2.2.5> 6c15: 00 6c16: 48 89 e5 mov %rsp,%rbp 6c19: 74 0c je 6c27 <__do_global_dtors_aux+0x27> 6c1b: 48 8b 3d e6 73 00 00 mov 0x73e6(%rip),%rdi # e008 <__dso_handle> 6c22: e8 89 a4 ff ff callq 10b0 <__cxa_finalize@plt> 6c27: e8 64 ff ff ff callq 6b90 6c2c: c6 05 b5 74 00 00 01 movb $0x1,0x74b5(%rip) # e0e8 6c33: 5d pop %rbp 6c34: c3 retq 6c35: 0f 1f 00 nopl (%rax) 6c38: c3 retq 6c39: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) 0000000000006c40 : 6c40: f3 0f 1e fa endbr64 6c44: e9 77 ff ff ff jmpq 6bc0 0000000000006c49 : 6c49: 53 push %rbx 6c4a: 41 50 push %r8 6c4c: 41 51 push %r9 6c4e: 49 c7 c0 
01 00 00 00 mov $0x1,%r8 6c55: 49 c7 c1 14 00 00 00 mov $0x14,%r9 6c5c: 48 31 db xor %rbx,%rbx 0000000000006c5f : 6c5f: 4c 01 c3 add %r8,%rbx 6c62: 4c 01 c3 add %r8,%rbx 6c65: 4c 01 c3 add %r8,%rbx 6c68: 4c 01 c3 add %r8,%rbx 6c6b: 4c 01 c3 add %r8,%rbx 6c6e: 4c 01 c3 add %r8,%rbx 6c71: 4c 01 c3 add %r8,%rbx 6c74: 4c 01 c3 add %r8,%rbx 6c77: 4c 01 c3 add %r8,%rbx 6c7a: 4c 01 c3 add %r8,%rbx 6c7d: 4c 01 c3 add %r8,%rbx 6c80: 4c 01 c3 add %r8,%rbx 6c83: 4c 01 c3 add %r8,%rbx 6c86: 4c 01 c3 add %r8,%rbx 6c89: 4c 01 c3 add %r8,%rbx 6c8c: 4c 01 c3 add %r8,%rbx 6c8f: 4c 01 c3 add %r8,%rbx 6c92: 4c 01 c3 add %r8,%rbx 6c95: 4c 01 c3 add %r8,%rbx 6c98: 4c 01 c3 add %r8,%rbx 6c9b: 4c 29 cf sub %r9,%rdi 6c9e: 75 bf jne 6c5f 6ca0: 41 59 pop %r9 6ca2: 41 58 pop %r8 6ca4: 5b pop %rbx 6ca5: c3 retq 0000000000006ca6 : 6ca6: 53 push %rbx 6ca7: 41 50 push %r8 6ca9: 41 51 push %r9 6cab: 49 c7 c0 01 00 00 00 mov $0x1,%r8 6cb2: 49 c7 c1 14 00 00 00 mov $0x14,%r9 6cb9: 48 31 db xor %rbx,%rbx 0000000000006cbc : 6cbc: 4c 01 c3 add %r8,%rbx 6cbf: 49 89 d8 mov %rbx,%r8 6cc2: 4c 01 c3 add %r8,%rbx 6cc5: 49 89 d8 mov %rbx,%r8 6cc8: 4c 01 c3 add %r8,%rbx 6ccb: 49 89 d8 mov %rbx,%r8 6cce: 4c 01 c3 add %r8,%rbx 6cd1: 49 89 d8 mov %rbx,%r8 6cd4: 4c 01 c3 add %r8,%rbx 6cd7: 49 89 d8 mov %rbx,%r8 6cda: 4c 01 c3 add %r8,%rbx 6cdd: 49 89 d8 mov %rbx,%r8 6ce0: 4c 01 c3 add %r8,%rbx 6ce3: 49 89 d8 mov %rbx,%r8 6ce6: 4c 01 c3 add %r8,%rbx 6ce9: 49 89 d8 mov %rbx,%r8 6cec: 4c 01 c3 add %r8,%rbx 6cef: 49 89 d8 mov %rbx,%r8 6cf2: 4c 01 c3 add %r8,%rbx 6cf5: 49 89 d8 mov %rbx,%r8 6cf8: 4c 01 c3 add %r8,%rbx 6cfb: 49 89 d8 mov %rbx,%r8 6cfe: 4c 01 c3 add %r8,%rbx 6d01: 49 89 d8 mov %rbx,%r8 6d04: 4c 01 c3 add %r8,%rbx 6d07: 49 89 d8 mov %rbx,%r8 6d0a: 4c 01 c3 add %r8,%rbx 6d0d: 49 89 d8 mov %rbx,%r8 6d10: 4c 01 c3 add %r8,%rbx 6d13: 49 89 d8 mov %rbx,%r8 6d16: 4c 01 c3 add %r8,%rbx 6d19: 49 89 d8 mov %rbx,%r8 6d1c: 4c 01 c3 add %r8,%rbx 6d1f: 49 89 d8 mov %rbx,%r8 6d22: 4c 01 c3 add %r8,%rbx 6d25: 49 89 d8 
mov %rbx,%r8 6d28: 4c 01 c3 add %r8,%rbx 6d2b: 49 89 d8 mov %rbx,%r8 6d2e: 4c 01 c3 add %r8,%rbx 6d31: 49 89 d8 mov %rbx,%r8 6d34: 4c 29 cf sub %r9,%rdi 6d37: 75 83 jne 6cbc 6d39: 41 59 pop %r9 6d3b: 41 58 pop %r8 6d3d: 5b pop %rbx 6d3e: c3 retq 0000000000006d3f : 6d3f: 53 push %rbx 6d40: 41 51 push %r9 6d42: 49 c7 c1 14 00 00 00 mov $0x14,%r9 0000000000006d49 : 6d49: 66 90 xchg %ax,%ax 6d4b: 66 90 xchg %ax,%ax 6d4d: 66 90 xchg %ax,%ax 6d4f: 66 90 xchg %ax,%ax 6d51: 66 90 xchg %ax,%ax 6d53: 66 90 xchg %ax,%ax 6d55: 66 90 xchg %ax,%ax 6d57: 66 90 xchg %ax,%ax 6d59: 66 90 xchg %ax,%ax 6d5b: 66 90 xchg %ax,%ax 6d5d: 66 90 xchg %ax,%ax 6d5f: 66 90 xchg %ax,%ax 6d61: 66 90 xchg %ax,%ax 6d63: 66 90 xchg %ax,%ax 6d65: 66 90 xchg %ax,%ax 6d67: 66 90 xchg %ax,%ax 6d69: 66 90 xchg %ax,%ax 6d6b: 66 90 xchg %ax,%ax 6d6d: 66 90 xchg %ax,%ax 6d6f: 4c 29 cf sub %r9,%rdi 6d72: 75 d5 jne 6d49 6d74: 41 59 pop %r9 6d76: 5b pop %rbx 6d77: c3 retq 0000000000006d78 : 6d78: 53 push %rbx 6d79: 41 51 push %r9 6d7b: 49 c7 c1 14 00 00 00 mov $0x14,%r9 0000000000006d82 : 6d82: 90 nop 6d83: 90 nop 6d84: 90 nop 6d85: 90 nop 6d86: 90 nop 6d87: 90 nop 6d88: 90 nop 6d89: 90 nop 6d8a: 90 nop 6d8b: 90 nop 6d8c: 90 nop 6d8d: 90 nop 6d8e: 90 nop 6d8f: 90 nop 6d90: 90 nop 6d91: 90 nop 6d92: 90 nop 6d93: 90 nop 6d94: 90 nop 6d95: 4c 29 cf sub %r9,%rdi 6d98: 75 e8 jne 6d82 6d9a: 41 59 pop %r9 6d9c: 5b pop %rbx 6d9d: c3 retq 0000000000006d9e : 6d9e: 53 push %rbx 6d9f: 51 push %rcx 6da0: 41 50 push %r8 6da2: 41 51 push %r9 6da4: 41 52 push %r10 6da6: 41 53 push %r11 6da8: 41 54 push %r12 6daa: 41 55 push %r13 6dac: 41 56 push %r14 6dae: 41 57 push %r15 6db0: 49 c7 c0 01 00 00 00 mov $0x1,%r8 6db7: 49 c7 c1 14 00 00 00 mov $0x14,%r9 6dbe: 48 31 db xor %rbx,%rbx 6dc1: 48 31 c9 xor %rcx,%rcx 6dc4: 4d 31 d2 xor %r10,%r10 6dc7: 4d 31 db xor %r11,%r11 6dca: 4d 31 e4 xor %r12,%r12 6dcd: 4d 31 ed xor %r13,%r13 6dd0: 4d 31 f6 xor %r14,%r14 6dd3: 4d 31 ff xor %r15,%r15 0000000000006dd6 : 6dd6: 4d 01 c7 add %r8,%r15 
6dd9: 4d 01 c6 add %r8,%r14 6ddc: 4d 01 c5 add %r8,%r13 6ddf: 4d 01 c4 add %r8,%r12 6de2: 4d 01 c3 add %r8,%r11 6de5: 4d 01 c2 add %r8,%r10 6de8: 4c 01 c1 add %r8,%rcx 6deb: 4d 01 c7 add %r8,%r15 6dee: 4d 01 c6 add %r8,%r14 6df1: 4d 01 c5 add %r8,%r13 6df4: 4d 01 c4 add %r8,%r12 6df7: 4d 01 c3 add %r8,%r11 6dfa: 4d 01 c2 add %r8,%r10 6dfd: 4c 01 c1 add %r8,%rcx 6e00: 4d 01 c7 add %r8,%r15 6e03: 4d 01 c6 add %r8,%r14 6e06: 4d 01 c5 add %r8,%r13 6e09: 4d 01 c4 add %r8,%r12 6e0c: 4d 01 c3 add %r8,%r11 6e0f: 4d 01 c2 add %r8,%r10 6e12: 4c 29 cf sub %r9,%rdi 6e15: 75 bf jne 6dd6 6e17: 41 5f pop %r15 6e19: 41 5e pop %r14 6e1b: 41 5d pop %r13 6e1d: 41 5c pop %r12 6e1f: 41 5b pop %r11 6e21: 41 5a pop %r10 6e23: 41 59 pop %r9 6e25: 41 58 pop %r8 6e27: 59 pop %rcx 6e28: 5b pop %rbx 6e29: c3 retq 0000000000006e2a : 6e2a: 53 push %rbx 6e2b: 51 push %rcx 6e2c: 41 50 push %r8 6e2e: 41 51 push %r9 6e30: 41 52 push %r10 6e32: 41 53 push %r11 6e34: 41 54 push %r12 6e36: 41 55 push %r13 6e38: 41 56 push %r14 6e3a: 41 57 push %r15 6e3c: 49 c7 c0 01 00 00 00 mov $0x1,%r8 6e43: 49 c7 c1 14 00 00 00 mov $0x14,%r9 6e4a: 48 31 db xor %rbx,%rbx 6e4d: 48 31 c9 xor %rcx,%rcx 6e50: 4d 31 d2 xor %r10,%r10 6e53: 4d 31 db xor %r11,%r11 6e56: 4d 31 e4 xor %r12,%r12 6e59: 4d 31 ed xor %r13,%r13 6e5c: 4d 31 f6 xor %r14,%r14 6e5f: 4d 31 ff xor %r15,%r15 0000000000006e62 : 6e62: 4d 01 c7 add %r8,%r15 6e65: 4d 01 c6 add %r8,%r14 6e68: 4d 01 c5 add %r8,%r13 6e6b: 4d 01 c4 add %r8,%r12 6e6e: 90 nop 6e6f: 4d 01 c2 add %r8,%r10 6e72: 4c 01 c1 add %r8,%rcx 6e75: 4d 01 c7 add %r8,%r15 6e78: 4d 01 c6 add %r8,%r14 6e7b: 90 nop 6e7c: 4d 01 c4 add %r8,%r12 6e7f: 4d 01 c3 add %r8,%r11 6e82: 4d 01 c2 add %r8,%r10 6e85: 4c 01 c1 add %r8,%rcx 6e88: 90 nop 6e89: 4d 01 c6 add %r8,%r14 6e8c: 4d 01 c5 add %r8,%r13 6e8f: 4d 01 c4 add %r8,%r12 6e92: 4d 01 c3 add %r8,%r11 6e95: 90 nop 6e96: 4c 29 cf sub %r9,%rdi 6e99: 75 c7 jne 6e62 6e9b: 41 5f pop %r15 6e9d: 41 5e pop %r14 6e9f: 41 5d pop %r13 6ea1: 41 5c pop %r12 6ea3: 
41 5b pop %r11 6ea5: 41 5a pop %r10 6ea7: 41 59 pop %r9 6ea9: 41 58 pop %r8 6eab: 59 pop %rcx 6eac: 5b pop %rbx 6ead: c3 retq 0000000000006eae : 6eae: 53 push %rbx 6eaf: 51 push %rcx 6eb0: 41 50 push %r8 6eb2: 41 51 push %r9 6eb4: 41 52 push %r10 6eb6: 41 53 push %r11 6eb8: 41 54 push %r12 6eba: 41 55 push %r13 6ebc: 41 56 push %r14 6ebe: 41 57 push %r15 6ec0: 49 c7 c0 01 00 00 00 mov $0x1,%r8 6ec7: 49 c7 c1 14 00 00 00 mov $0x14,%r9 6ece: 48 31 db xor %rbx,%rbx 6ed1: 48 31 c9 xor %rcx,%rcx 6ed4: 4d 31 d2 xor %r10,%r10 6ed7: 4d 31 db xor %r11,%r11 6eda: 4d 31 e4 xor %r12,%r12 6edd: 4d 31 ed xor %r13,%r13 6ee0: 4d 31 f6 xor %r14,%r14 6ee3: 4d 31 ff xor %r15,%r15 0000000000006ee6 : 6ee6: 4d 01 c7 add %r8,%r15 6ee9: 4d 01 c6 add %r8,%r14 6eec: 4d 01 c5 add %r8,%r13 6eef: 4d 01 c4 add %r8,%r12 6ef2: 4c 89 fa mov %r15,%rdx 6ef5: 4d 01 c2 add %r8,%r10 6ef8: 4c 01 c1 add %r8,%rcx 6efb: 4d 01 c7 add %r8,%r15 6efe: 4d 01 c6 add %r8,%r14 6f01: 4c 89 fa mov %r15,%rdx 6f04: 4d 01 c4 add %r8,%r12 6f07: 4d 01 c3 add %r8,%r11 6f0a: 4d 01 c2 add %r8,%r10 6f0d: 4c 01 c1 add %r8,%rcx 6f10: 4c 89 fa mov %r15,%rdx 6f13: 4d 01 c6 add %r8,%r14 6f16: 4d 01 c5 add %r8,%r13 6f19: 4d 01 c4 add %r8,%r12 6f1c: 4d 01 c3 add %r8,%r11 6f1f: 4c 89 fa mov %r15,%rdx 6f22: 4c 29 cf sub %r9,%rdi 6f25: 75 bf jne 6ee6 6f27: 41 5f pop %r15 6f29: 41 5e pop %r14 6f2b: 41 5d pop %r13 6f2d: 41 5c pop %r12 6f2f: 41 5b pop %r11 6f31: 41 5a pop %r10 6f33: 41 59 pop %r9 6f35: 41 58 pop %r8 6f37: 59 pop %rcx 6f38: 5b pop %rbx 6f39: c3 retq 0000000000006f3a : 6f3a: 53 push %rbx 6f3b: 51 push %rcx 6f3c: 41 50 push %r8 6f3e: 41 51 push %r9 6f40: 41 52 push %r10 6f42: 41 53 push %r11 6f44: 41 54 push %r12 6f46: 41 55 push %r13 6f48: 41 56 push %r14 6f4a: 41 57 push %r15 6f4c: 49 c7 c0 01 00 00 00 mov $0x1,%r8 6f53: 49 c7 c1 14 00 00 00 mov $0x14,%r9 6f5a: 4c 89 c3 mov %r8,%rbx 6f5d: 4c 89 c1 mov %r8,%rcx 6f60: 4d 89 c2 mov %r8,%r10 6f63: 4d 89 c3 mov %r8,%r11 6f66: 4d 89 c4 mov %r8,%r12 6f69: 4d 89 c5 mov %r8,%r13 
6f6c: 4d 89 c6 mov %r8,%r14 6f6f: 4d 89 c7 mov %r8,%r15 0000000000006f72 : 6f72: 49 d1 cf ror %r15 6f75: 49 d1 ce ror %r14 6f78: 49 d1 cd ror %r13 6f7b: 49 d1 cc ror %r12 6f7e: 49 d1 cb ror %r11 6f81: 49 d1 cf ror %r15 6f84: 49 d1 ce ror %r14 6f87: 49 d1 cd ror %r13 6f8a: 49 d1 cc ror %r12 6f8d: 49 d1 cb ror %r11 6f90: 49 d1 cf ror %r15 6f93: 49 d1 ce ror %r14 6f96: 49 d1 cd ror %r13 6f99: 49 d1 cc ror %r12 6f9c: 49 d1 cb ror %r11 6f9f: 49 d1 cf ror %r15 6fa2: 49 d1 ce ror %r14 6fa5: 49 d1 cd ror %r13 6fa8: 49 d1 cc ror %r12 6fab: 49 d1 cb ror %r11 6fae: 4c 29 cf sub %r9,%rdi 6fb1: 75 bf jne 6f72 6fb3: 41 5f pop %r15 6fb5: 41 5e pop %r14 6fb7: 41 5d pop %r13 6fb9: 41 5c pop %r12 6fbb: 41 5b pop %r11 6fbd: 41 5a pop %r10 6fbf: 41 59 pop %r9 6fc1: 41 58 pop %r8 6fc3: 59 pop %rcx 6fc4: 5b pop %rbx 6fc5: c3 retq 0000000000006fc6 : 6fc6: 53 push %rbx 6fc7: 51 push %rcx 6fc8: 41 50 push %r8 6fca: 41 51 push %r9 6fcc: 41 52 push %r10 6fce: 41 53 push %r11 6fd0: 41 54 push %r12 6fd2: 41 55 push %r13 6fd4: 41 56 push %r14 6fd6: 41 57 push %r15 6fd8: 49 c7 c0 01 00 00 00 mov $0x1,%r8 6fdf: 49 c7 c1 14 00 00 00 mov $0x14,%r9 6fe6: 4c 89 c3 mov %r8,%rbx 6fe9: 4c 89 c1 mov %r8,%rcx 6fec: 4d 89 c2 mov %r8,%r10 6fef: 4d 89 c3 mov %r8,%r11 6ff2: 4d 89 c4 mov %r8,%r12 6ff5: 4d 89 c5 mov %r8,%r13 6ff8: 4d 89 c6 mov %r8,%r14 6ffb: 4d 89 c7 mov %r8,%r15 0000000000006ffe : 6ffe: 49 d1 e7 shl %r15 7001: 49 d1 e6 shl %r14 7004: 49 d1 e5 shl %r13 7007: 49 d1 e4 shl %r12 700a: 49 d1 e3 shl %r11 700d: 49 d1 e7 shl %r15 7010: 49 d1 e6 shl %r14 7013: 49 d1 e5 shl %r13 7016: 49 d1 e4 shl %r12 7019: 49 d1 e3 shl %r11 701c: 49 d1 e7 shl %r15 701f: 49 d1 e6 shl %r14 7022: 49 d1 e5 shl %r13 7025: 49 d1 e4 shl %r12 7028: 49 d1 e3 shl %r11 702b: 49 d1 e7 shl %r15 702e: 49 d1 e6 shl %r14 7031: 49 d1 e5 shl %r13 7034: 49 d1 e4 shl %r12 7037: 49 d1 e3 shl %r11 703a: 4c 29 cf sub %r9,%rdi 703d: 75 bf jne 6ffe 703f: 41 5f pop %r15 7041: 41 5e pop %r14 7043: 41 5d pop %r13 7045: 41 5c pop %r12 7047: 41 5b 
pop %r11 7049: 41 5a pop %r10 704b: 41 59 pop %r9 704d: 41 58 pop %r8 704f: 59 pop %rcx 7050: 5b pop %rbx 7051: c3 retq 0000000000007052 : 7052: 53 push %rbx 7053: 51 push %rcx 7054: 41 50 push %r8 7056: 41 51 push %r9 7058: 41 52 push %r10 705a: 41 53 push %r11 705c: 41 54 push %r12 705e: 41 55 push %r13 7060: 41 56 push %r14 7062: 41 57 push %r15 7064: 49 c7 c0 01 00 00 00 mov $0x1,%r8 706b: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7072: 4c 89 c3 mov %r8,%rbx 7075: 4c 89 c1 mov %r8,%rcx 7078: 4d 89 c2 mov %r8,%r10 707b: 4d 89 c3 mov %r8,%r11 707e: 4d 89 c4 mov %r8,%r12 7081: 4d 89 c5 mov %r8,%r13 7084: 4d 89 c6 mov %r8,%r14 7087: 4d 89 c7 mov %r8,%r15 000000000000708a : 708a: 49 d1 cf ror %r15 708d: 49 d1 e6 shl %r14 7090: 49 d1 cd ror %r13 7093: 49 d1 e4 shl %r12 7096: 49 d1 cb ror %r11 7099: 49 d1 e7 shl %r15 709c: 49 d1 ce ror %r14 709f: 49 d1 e5 shl %r13 70a2: 49 d1 cc ror %r12 70a5: 49 d1 e3 shl %r11 70a8: 49 d1 cf ror %r15 70ab: 49 d1 e6 shl %r14 70ae: 49 d1 cd ror %r13 70b1: 49 d1 e4 shl %r12 70b4: 49 d1 cb ror %r11 70b7: 49 d1 e7 shl %r15 70ba: 49 d1 ce ror %r14 70bd: 49 d1 e5 shl %r13 70c0: 49 d1 cc ror %r12 70c3: 49 d1 e3 shl %r11 70c6: 4c 29 cf sub %r9,%rdi 70c9: 75 bf jne 708a 70cb: 41 5f pop %r15 70cd: 41 5e pop %r14 70cf: 41 5d pop %r13 70d1: 41 5c pop %r12 70d3: 41 5b pop %r11 70d5: 41 5a pop %r10 70d7: 41 59 pop %r9 70d9: 41 58 pop %r8 70db: 59 pop %rcx 70dc: 5b pop %rbx 70dd: c3 retq 00000000000070de : 70de: 53 push %rbx 70df: 51 push %rcx 70e0: 56 push %rsi 70e1: 52 push %rdx 70e2: 41 50 push %r8 70e4: 41 51 push %r9 70e6: 41 52 push %r10 70e8: 41 53 push %r11 70ea: 41 54 push %r12 70ec: 41 55 push %r13 70ee: 41 56 push %r14 70f0: 41 57 push %r15 70f2: 49 c7 c0 03 00 00 00 mov $0x3,%r8 70f9: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7100: 4c 89 c3 mov %r8,%rbx 7103: 4c 89 c1 mov %r8,%rcx 7106: 4d 89 c2 mov %r8,%r10 7109: 4d 89 c3 mov %r8,%r11 710c: 4d 89 c4 mov %r8,%r12 710f: 4d 89 c5 mov %r8,%r13 7112: 4d 89 c6 mov %r8,%r14 7115: 4d 89 c7 mov %r8,%r15 
0000000000007118 : 7118: 49 d1 cf ror %r15 711b: 4d 0f af f0 imul %r8,%r14 711f: 4d 89 ce mov %r9,%r14 7122: 49 d1 cd ror %r13 7125: 4d 0f af e0 imul %r8,%r12 7129: 4d 89 cc mov %r9,%r12 712c: 49 d1 cb ror %r11 712f: 4d 0f af d0 imul %r8,%r10 7133: 4d 89 ca mov %r9,%r10 7136: 48 d1 cb ror %rbx 7139: 49 0f af c8 imul %r8,%rcx 713d: 4c 89 c9 mov %r9,%rcx 7140: 48 d1 ce ror %rsi 7143: 49 0f af c0 imul %r8,%rax 7147: 4c 89 c8 mov %r9,%rax 714a: 49 d1 cf ror %r15 714d: 4d 0f af f0 imul %r8,%r14 7151: 4d 89 ce mov %r9,%r14 7154: 49 d1 cd ror %r13 7157: 4d 0f af e0 imul %r8,%r12 715b: 4d 89 cc mov %r9,%r12 715e: 49 d1 cb ror %r11 7161: 4d 0f af d0 imul %r8,%r10 7165: 4d 89 ca mov %r9,%r10 7168: 48 d1 cb ror %rbx 716b: 49 0f af c8 imul %r8,%rcx 716f: 4c 89 c9 mov %r9,%rcx 7172: 48 d1 ce ror %rsi 7175: 49 0f af d0 imul %r8,%rdx 7179: 4c 29 cf sub %r9,%rdi 717c: 75 9a jne 7118 717e: 41 5f pop %r15 7180: 41 5e pop %r14 7182: 41 5d pop %r13 7184: 41 5c pop %r12 7186: 41 5b pop %r11 7188: 41 5a pop %r10 718a: 41 59 pop %r9 718c: 41 58 pop %r8 718e: 5a pop %rdx 718f: 5e pop %rsi 7190: 59 pop %rcx 7191: 5b pop %rbx 7192: c3 retq 0000000000007193 : 7193: 53 push %rbx 7194: 51 push %rcx 7195: 52 push %rdx 7196: 56 push %rsi 7197: 41 50 push %r8 7199: 41 51 push %r9 719b: 41 52 push %r10 719d: 41 53 push %r11 719f: 41 54 push %r12 71a1: 41 55 push %r13 71a3: 41 56 push %r14 71a5: 41 57 push %r15 71a7: 49 c7 c0 01 00 00 00 mov $0x1,%r8 71ae: 49 c7 c1 14 00 00 00 mov $0x14,%r9 71b5: 4c 89 c3 mov %r8,%rbx 71b8: 4c 89 c1 mov %r8,%rcx 71bb: 4d 89 c2 mov %r8,%r10 71be: 4d 89 c3 mov %r8,%r11 71c1: 4d 89 c4 mov %r8,%r12 71c4: 4d 89 c5 mov %r8,%r13 71c7: 4d 89 c6 mov %r8,%r14 71ca: 4d 89 c7 mov %r8,%r15 71cd: 49 ff c0 inc %r8 00000000000071d0 : 71d0: 4d 0f ab c7 bts %r8,%r15 71d4: 49 d1 ce ror %r14 71d7: 4d 0f ab c5 bts %r8,%r13 71db: 49 d1 cc ror %r12 71de: 4d 0f ab c3 bts %r8,%r11 71e2: 49 d1 ca ror %r10 71e5: 4c 0f ab c1 bts %r8,%rcx 71e9: 48 d1 cb ror %rbx 71ec: 4c 0f ab c2 bts %r8,%rdx 
71f0: 48 d1 ce ror %rsi 71f3: 4d 0f ab c7 bts %r8,%r15 71f7: 49 d1 ce ror %r14 71fa: 4d 0f ab c5 bts %r8,%r13 71fe: 49 d1 cc ror %r12 7201: 4d 0f ab c3 bts %r8,%r11 7205: 49 d1 ca ror %r10 7208: 4c 0f ab c1 bts %r8,%rcx 720c: 48 d1 cb ror %rbx 720f: 4c 0f ab c2 bts %r8,%rdx 7213: 48 d1 ce ror %rsi 7216: 4c 29 cf sub %r9,%rdi 7219: 75 b5 jne 71d0 721b: 41 5f pop %r15 721d: 41 5e pop %r14 721f: 41 5d pop %r13 7221: 41 5c pop %r12 7223: 41 5b pop %r11 7225: 41 5a pop %r10 7227: 41 59 pop %r9 7229: 41 58 pop %r8 722b: 5e pop %rsi 722c: 5a pop %rdx 722d: 59 pop %rcx 722e: 5b pop %rbx 722f: c3 retq 0000000000007230 : 7230: 53 push %rbx 7231: 51 push %rcx 7232: 41 50 push %r8 7234: 41 51 push %r9 7236: 41 52 push %r10 7238: 41 53 push %r11 723a: 41 54 push %r12 723c: 41 55 push %r13 723e: 41 56 push %r14 7240: 41 57 push %r15 7242: 49 c7 c0 01 00 00 00 mov $0x1,%r8 7249: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7250: 4c 89 c3 mov %r8,%rbx 7253: 4c 89 c1 mov %r8,%rcx 7256: 4d 89 c2 mov %r8,%r10 7259: 4d 89 c3 mov %r8,%r11 725c: 4d 89 c4 mov %r8,%r12 725f: 4d 89 c5 mov %r8,%r13 7262: 4d 89 c6 mov %r8,%r14 7265: 4d 89 c7 mov %r8,%r15 7268: 49 ff c0 inc %r8 000000000000726b : 726b: 4d 0f ab c7 bts %r8,%r15 726f: 4d 0f ab c6 bts %r8,%r14 7273: 4d 0f ab c5 bts %r8,%r13 7277: 4d 0f ab c4 bts %r8,%r12 727b: 4d 0f ab c3 bts %r8,%r11 727f: 4d 0f ab c7 bts %r8,%r15 7283: 4d 0f ab c6 bts %r8,%r14 7287: 4d 0f ab c5 bts %r8,%r13 728b: 4d 0f ab c4 bts %r8,%r12 728f: 4d 0f ab c3 bts %r8,%r11 7293: 4d 0f ab c7 bts %r8,%r15 7297: 4d 0f ab c6 bts %r8,%r14 729b: 4d 0f ab c5 bts %r8,%r13 729f: 4d 0f ab c4 bts %r8,%r12 72a3: 4d 0f ab c3 bts %r8,%r11 72a7: 4d 0f ab c7 bts %r8,%r15 72ab: 4d 0f ab c6 bts %r8,%r14 72af: 4d 0f ab c5 bts %r8,%r13 72b3: 4d 0f ab c4 bts %r8,%r12 72b7: 4d 0f ab c3 bts %r8,%r11 72bb: 4c 29 cf sub %r9,%rdi 72be: 75 ab jne 726b 72c0: 41 5f pop %r15 72c2: 41 5e pop %r14 72c4: 41 5d pop %r13 72c6: 41 5c pop %r12 72c8: 41 5b pop %r11 72ca: 41 5a pop %r10 72cc: 41 59 pop %r9 72ce: 
41 58 pop %r8 72d0: 59 pop %rcx 72d1: 5b pop %rbx 72d2: c3 retq 00000000000072d3 : 72d3: 53 push %rbx 72d4: 51 push %rcx 72d5: 41 50 push %r8 72d7: 41 51 push %r9 72d9: 41 52 push %r10 72db: 41 53 push %r11 72dd: 41 54 push %r12 72df: 41 55 push %r13 72e1: 41 56 push %r14 72e3: 41 57 push %r15 72e5: 49 c7 c0 01 00 00 00 mov $0x1,%r8 72ec: 49 c7 c1 14 00 00 00 mov $0x14,%r9 72f3: 4c 89 c3 mov %r8,%rbx 72f6: 4c 89 c1 mov %r8,%rcx 72f9: 4d 89 c2 mov %r8,%r10 72fc: 4d 89 c3 mov %r8,%r11 72ff: 4d 89 c4 mov %r8,%r12 7302: 4d 89 c5 mov %r8,%r13 7305: 4d 89 c6 mov %r8,%r14 7308: 4d 89 c7 mov %r8,%r15 730b: 49 ff c0 inc %r8 000000000000730e : 730e: 4f 8d 14 d1 lea (%r9,%r10,8),%r10 7312: 4f 8d 1c d9 lea (%r9,%r11,8),%r11 7316: 4f 8d 24 e1 lea (%r9,%r12,8),%r12 731a: 4f 8d 2c e9 lea (%r9,%r13,8),%r13 731e: 4f 8d 34 f1 lea (%r9,%r14,8),%r14 7322: 4f 8d 3c f9 lea (%r9,%r15,8),%r15 7326: 4f 8d 14 d1 lea (%r9,%r10,8),%r10 732a: 4f 8d 1c d9 lea (%r9,%r11,8),%r11 732e: 4f 8d 24 e1 lea (%r9,%r12,8),%r12 7332: 4f 8d 2c e9 lea (%r9,%r13,8),%r13 7336: 4f 8d 34 f1 lea (%r9,%r14,8),%r14 733a: 4f 8d 3c f9 lea (%r9,%r15,8),%r15 733e: 4f 8d 14 d1 lea (%r9,%r10,8),%r10 7342: 4f 8d 1c d9 lea (%r9,%r11,8),%r11 7346: 4f 8d 24 e1 lea (%r9,%r12,8),%r12 734a: 4f 8d 2c e9 lea (%r9,%r13,8),%r13 734e: 4f 8d 34 f1 lea (%r9,%r14,8),%r14 7352: 4f 8d 3c f9 lea (%r9,%r15,8),%r15 7356: 4f 8d 14 d1 lea (%r9,%r10,8),%r10 735a: 4f 8d 1c d9 lea (%r9,%r11,8),%r11 735e: 4c 29 cf sub %r9,%rdi 7361: 75 ab jne 730e 7363: 41 5f pop %r15 7365: 41 5e pop %r14 7367: 41 5d pop %r13 7369: 41 5c pop %r12 736b: 41 5b pop %r11 736d: 41 5a pop %r10 736f: 41 59 pop %r9 7371: 41 58 pop %r8 7373: 59 pop %rcx 7374: 5b pop %rbx 7375: c3 retq 0000000000007376 : 7376: 53 push %rbx 7377: 51 push %rcx 7378: 52 push %rdx 7379: 56 push %rsi 737a: 41 50 push %r8 737c: 41 51 push %r9 737e: 41 52 push %r10 7380: 41 53 push %r11 7382: 41 54 push %r12 7384: 41 55 push %r13 7386: 41 56 push %r14 7388: 41 57 push %r15 738a: 49 c7 c0 01 00 00 
00 mov $0x1,%r8 7391: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7398: 4c 89 c3 mov %r8,%rbx 739b: 4c 89 c1 mov %r8,%rcx 739e: 4d 89 c2 mov %r8,%r10 73a1: 4d 89 c3 mov %r8,%r11 73a4: 4d 89 c4 mov %r8,%r12 73a7: 4d 89 c5 mov %r8,%r13 73aa: 4d 89 c6 mov %r8,%r14 73ad: 4d 89 c7 mov %r8,%r15 73b0: 49 ff c0 inc %r8 00000000000073b3 : 73b3: 4f 8d 3c f9 lea (%r9,%r15,8),%r15 73b7: 4d 0f af f0 imul %r8,%r14 73bb: 4d 89 c6 mov %r8,%r14 73be: 4f 8d 2c e9 lea (%r9,%r13,8),%r13 73c2: 4d 0f af e0 imul %r8,%r12 73c6: 4d 89 c4 mov %r8,%r12 73c9: 4f 8d 1c d9 lea (%r9,%r11,8),%r11 73cd: 4d 0f af d0 imul %r8,%r10 73d1: 4d 89 c2 mov %r8,%r10 73d4: 49 8d 1c d9 lea (%r9,%rbx,8),%rbx 73d8: 49 0f af c8 imul %r8,%rcx 73dc: 4c 89 c1 mov %r8,%rcx 73df: 49 8d 14 d1 lea (%r9,%rdx,8),%rdx 73e3: 49 0f af c0 imul %r8,%rax 73e7: 4f 8d 3c f9 lea (%r9,%r15,8),%r15 73eb: 4d 0f af f0 imul %r8,%r14 73ef: 4f 8d 2c e9 lea (%r9,%r13,8),%r13 73f3: 4d 0f af e0 imul %r8,%r12 73f7: 4f 8d 1c d9 lea (%r9,%r11,8),%r11 73fb: 4d 0f af d0 imul %r8,%r10 73ff: 49 8d 1c d9 lea (%r9,%rbx,8),%rbx 7403: 49 0f af c8 imul %r8,%rcx 7407: 49 8d 14 d1 lea (%r9,%rdx,8),%rdx 740b: 49 0f af c0 imul %r8,%rax 740f: 4c 29 cf sub %r9,%rdi 7412: 75 9f jne 73b3 7414: 41 5f pop %r15 7416: 41 5e pop %r14 7418: 41 5d pop %r13 741a: 41 5c pop %r12 741c: 41 5b pop %r11 741e: 41 5a pop %r10 7420: 41 59 pop %r9 7422: 41 58 pop %r8 7424: 5e pop %rsi 7425: 5a pop %rdx 7426: 59 pop %rcx 7427: 5b pop %rbx 7428: c3 retq 0000000000007429 : 7429: 53 push %rbx 742a: 51 push %rcx 742b: 56 push %rsi 742c: 52 push %rdx 742d: 41 50 push %r8 742f: 41 51 push %r9 7431: 41 52 push %r10 7433: 41 53 push %r11 7435: 41 54 push %r12 7437: 41 55 push %r13 7439: 41 56 push %r14 743b: 41 57 push %r15 743d: 49 c7 c0 01 00 00 00 mov $0x1,%r8 7444: 49 c7 c1 14 00 00 00 mov $0x14,%r9 744b: 4c 89 c3 mov %r8,%rbx 744e: 4c 89 c2 mov %r8,%rdx 7451: 4c 89 c6 mov %r8,%rsi 7454: 4c 89 c1 mov %r8,%rcx 7457: 4d 89 c2 mov %r8,%r10 745a: 4d 89 c3 mov %r8,%r11 745d: 4d 89 c4 mov 
%r8,%r12 7460: 4d 89 c5 mov %r8,%r13 7463: 4d 89 c6 mov %r8,%r14 7466: 4d 89 c7 mov %r8,%r15 7469: 49 ff c0 inc %r8 000000000000746c : 746c: 4d 0f af f0 imul %r8,%r14 7470: 4d 0f ab c5 bts %r8,%r13 7474: 4d 89 c5 mov %r8,%r13 7477: 4d 0f af e0 imul %r8,%r12 747b: 4d 0f ab c3 bts %r8,%r11 747f: 4d 89 c3 mov %r8,%r11 7482: 4d 0f af d0 imul %r8,%r10 7486: 4c 0f ab c3 bts %r8,%rbx 748a: 49 0f af c8 imul %r8,%rcx 748e: 4c 89 c1 mov %r8,%rcx 7491: 4c 0f ab c6 bts %r8,%rsi 7495: 49 0f af c0 imul %r8,%rax 7499: 4c 89 c0 mov %r8,%rax 749c: 4d 0f ab c7 bts %r8,%r15 74a0: 4d 0f af f0 imul %r8,%r14 74a4: 4d 89 c6 mov %r8,%r14 74a7: 4d 0f ab c5 bts %r8,%r13 74ab: 4d 0f af e0 imul %r8,%r12 74af: 4d 89 c4 mov %r8,%r12 74b2: 4d 0f ab c3 bts %r8,%r11 74b6: 4d 0f af d0 imul %r8,%r10 74ba: 4d 89 c2 mov %r8,%r10 74bd: 4c 0f ab c3 bts %r8,%rbx 74c1: 49 0f af c8 imul %r8,%rcx 74c5: 4c 89 c1 mov %r8,%rcx 74c8: 4c 0f ab c6 bts %r8,%rsi 74cc: 49 0f af d0 imul %r8,%rdx 74d0: 4c 89 c2 mov %r8,%rdx 74d3: 4d 0f ab c3 bts %r8,%r11 74d7: 4c 29 cf sub %r9,%rdi 74da: 75 90 jne 746c 74dc: 41 5f pop %r15 74de: 41 5e pop %r14 74e0: 41 5d pop %r13 74e2: 41 5c pop %r12 74e4: 41 5b pop %r11 74e6: 41 5a pop %r10 74e8: 41 59 pop %r9 74ea: 41 58 pop %r8 74ec: 5a pop %rdx 74ed: 5e pop %rsi 74ee: 59 pop %rcx 74ef: 5b pop %rbx 74f0: c3 retq 00000000000074f1 : 74f1: 56 push %rsi 74f2: 53 push %rbx 74f3: 51 push %rcx 74f4: 52 push %rdx 74f5: 41 50 push %r8 74f7: 41 51 push %r9 74f9: 41 52 push %r10 74fb: 41 53 push %r11 74fd: 41 54 push %r12 74ff: 41 55 push %r13 7501: 41 56 push %r14 7503: 41 57 push %r15 7505: 49 c7 c0 01 00 00 00 mov $0x1,%r8 750c: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7513: 48 31 db xor %rbx,%rbx 7516: 48 31 c9 xor %rcx,%rcx 7519: 4d 31 d2 xor %r10,%r10 751c: 4d 31 db xor %r11,%r11 751f: 4d 31 e4 xor %r12,%r12 7522: 4d 31 ed xor %r13,%r13 7525: 4d 31 f6 xor %r14,%r14 7528: 4d 31 ff xor %r15,%r15 752b: 48 31 f6 xor %rsi,%rsi 752e: 4d 89 c2 mov %r8,%r10 7531: 4d 89 c3 mov %r8,%r11 7534: 4c 89 c6 
mov %r8,%rsi 7537: 4c 89 c0 mov %r8,%rax 753a: 4c 89 c2 mov %r8,%rdx 000000000000753d : 753d: eb 04 jmp 7543 753f: 48 83 c0 01 add $0x1,%rax 0000000000007543 : 7543: eb 04 jmp 7549 7545: 48 83 c0 02 add $0x2,%rax 0000000000007549 : 7549: eb 04 jmp 754f 754b: 48 83 c0 03 add $0x3,%rax 000000000000754f : 754f: eb 04 jmp 7555 7551: 48 83 c0 04 add $0x4,%rax 0000000000007555 : 7555: eb 04 jmp 755b 7557: 48 83 c0 05 add $0x5,%rax 000000000000755b : 755b: eb 04 jmp 7561 755d: 48 83 c0 06 add $0x6,%rax 0000000000007561 : 7561: eb 04 jmp 7567 7563: 48 83 c0 07 add $0x7,%rax 0000000000007567 : 7567: eb 04 jmp 756d 7569: 48 83 c0 08 add $0x8,%rax 000000000000756d : 756d: eb 04 jmp 7573 756f: 48 83 c0 09 add $0x9,%rax 0000000000007573 : 7573: eb 04 jmp 7579 7575: 48 83 c0 0a add $0xa,%rax 0000000000007579 : 7579: eb 04 jmp 757f 757b: 48 83 c0 0b add $0xb,%rax 000000000000757f : 757f: eb 04 jmp 7585 7581: 48 83 c0 0c add $0xc,%rax 0000000000007585 : 7585: eb 04 jmp 758b 7587: 48 83 c0 0d add $0xd,%rax 000000000000758b : 758b: eb 04 jmp 7591 758d: 48 83 c0 0e add $0xe,%rax 0000000000007591 : 7591: eb 04 jmp 7597 7593: 48 83 c0 0f add $0xf,%rax 0000000000007597 : 7597: eb 04 jmp 759d 7599: 48 83 c0 10 add $0x10,%rax 000000000000759d : 759d: eb 04 jmp 75a3 759f: 48 83 c0 11 add $0x11,%rax 00000000000075a3 : 75a3: eb 04 jmp 75a9 75a5: 48 83 c0 12 add $0x12,%rax 00000000000075a9 : 75a9: eb 04 jmp 75af 75ab: 48 83 c0 13 add $0x13,%rax 00000000000075af : 75af: 4c 29 cf sub %r9,%rdi 75b2: 75 89 jne 753d 00000000000075b4 : 75b4: 41 5f pop %r15 75b6: 41 5e pop %r14 75b8: 41 5d pop %r13 75ba: 41 5c pop %r12 75bc: 41 5b pop %r11 75be: 41 5a pop %r10 75c0: 41 59 pop %r9 75c2: 41 58 pop %r8 75c4: 5a pop %rdx 75c5: 59 pop %rcx 75c6: 5b pop %rbx 75c7: 5e pop %rsi 75c8: c3 retq 00000000000075c9 : 75c9: 56 push %rsi 75ca: 53 push %rbx 75cb: 51 push %rcx 75cc: 52 push %rdx 75cd: 41 50 push %r8 75cf: 41 51 push %r9 75d1: 41 52 push %r10 75d3: 41 53 push %r11 75d5: 41 54 push %r12 75d7: 41 55 push 
%r13 75d9: 41 56 push %r14 75db: 41 57 push %r15 75dd: 49 c7 c0 01 00 00 00 mov $0x1,%r8 75e4: 49 c7 c1 14 00 00 00 mov $0x14,%r9 75eb: 48 31 db xor %rbx,%rbx 75ee: 48 31 c9 xor %rcx,%rcx 75f1: 4d 31 d2 xor %r10,%r10 75f4: 4d 31 db xor %r11,%r11 75f7: 4d 31 e4 xor %r12,%r12 75fa: 4d 31 ed xor %r13,%r13 75fd: 4d 31 f6 xor %r14,%r14 7600: 4d 31 ff xor %r15,%r15 7603: 48 31 f6 xor %rsi,%rsi 7606: 4d 89 c2 mov %r8,%r10 7609: 4d 89 c3 mov %r8,%r11 760c: 4c 89 c6 mov %r8,%rsi 760f: 4c 89 c0 mov %r8,%rax 7612: 4c 89 c2 mov %r8,%rdx 0000000000007615 : 7615: 4d 39 c1 cmp %r8,%r9 7618: 0f 84 71 01 00 00 je 778f 761e: 4d 39 c1 cmp %r8,%r9 7621: 0f 84 68 01 00 00 je 778f 7627: 4d 39 c1 cmp %r8,%r9 762a: 0f 84 5f 01 00 00 je 778f 7630: 4d 39 c1 cmp %r8,%r9 7633: 0f 84 56 01 00 00 je 778f 7639: 4d 39 c1 cmp %r8,%r9 763c: 0f 84 4d 01 00 00 je 778f 7642: 4d 39 c1 cmp %r8,%r9 7645: 0f 84 44 01 00 00 je 778f 764b: 4d 39 c1 cmp %r8,%r9 764e: 0f 84 3b 01 00 00 je 778f 7654: 4d 39 c1 cmp %r8,%r9 7657: 0f 84 32 01 00 00 je 778f 765d: 4d 39 c1 cmp %r8,%r9 7660: 0f 84 29 01 00 00 je 778f 7666: 4d 39 c1 cmp %r8,%r9 7669: 0f 84 20 01 00 00 je 778f 766f: 4d 39 c1 cmp %r8,%r9 7672: 0f 84 17 01 00 00 je 778f 7678: 4d 39 c1 cmp %r8,%r9 767b: 0f 84 0e 01 00 00 je 778f 7681: 4d 39 c1 cmp %r8,%r9 7684: 0f 84 05 01 00 00 je 778f 768a: 4d 39 c1 cmp %r8,%r9 768d: 0f 84 fc 00 00 00 je 778f 7693: 4d 39 c1 cmp %r8,%r9 7696: 0f 84 f3 00 00 00 je 778f 769c: 4d 39 c1 cmp %r8,%r9 769f: 0f 84 ea 00 00 00 je 778f 76a5: 4d 39 c1 cmp %r8,%r9 76a8: 0f 84 e1 00 00 00 je 778f 76ae: 4d 39 c1 cmp %r8,%r9 76b1: 0f 84 d8 00 00 00 je 778f 76b7: 4d 39 c1 cmp %r8,%r9 76ba: 0f 84 cf 00 00 00 je 778f 76c0: 4d 39 c1 cmp %r8,%r9 76c3: 0f 84 c6 00 00 00 je 778f 76c9: 4c 29 cf sub %r9,%rdi 76cc: 0f 85 43 ff ff ff jne 7615 00000000000076d2 : 76d2: 41 5f pop %r15 76d4: 41 5e pop %r14 76d6: 41 5d pop %r13 76d8: 41 5c pop %r12 76da: 41 5b pop %r11 76dc: 41 5a pop %r10 76de: 41 59 pop %r9 76e0: 41 58 pop %r8 76e2: 5a pop %rdx 76e3: 
59 pop %rcx 76e4: 5b pop %rbx 76e5: 5e pop %rsi 76e6: c3 retq 00000000000076e7 : 76e7: 56 push %rsi 76e8: 53 push %rbx 76e9: 51 push %rcx 76ea: 52 push %rdx 76eb: 41 50 push %r8 76ed: 41 51 push %r9 76ef: 41 52 push %r10 76f1: 41 53 push %r11 76f3: 41 54 push %r12 76f5: 41 55 push %r13 76f7: 41 56 push %r14 76f9: 41 57 push %r15 76fb: 49 c7 c0 02 00 00 00 mov $0x2,%r8 7702: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7709: 48 31 db xor %rbx,%rbx 770c: 48 31 c9 xor %rcx,%rcx 770f: 4d 31 db xor %r11,%r11 7712: 4d 31 e4 xor %r12,%r12 7715: 4d 31 ed xor %r13,%r13 7718: 4d 31 f6 xor %r14,%r14 771b: 4d 31 ff xor %r15,%r15 771e: 48 31 f6 xor %rsi,%rsi 7721: 4d 89 c2 mov %r8,%r10 7724: 4d 89 c3 mov %r8,%r11 7727: 4c 89 c6 mov %r8,%rsi 772a: 4c 89 c0 mov %r8,%rax 772d: 4c 89 c2 mov %r8,%rdx 0000000000007730 : 7730: 4d 39 c1 cmp %r8,%r9 7733: 74 5a je 778f 7735: 45 0f af d0 imul %r8d,%r10d 7739: 4d 39 c1 cmp %r8,%r9 773c: 74 51 je 778f 773e: 41 0f af f0 imul %r8d,%esi 7742: 4d 39 c1 cmp %r8,%r9 7745: 74 48 je 778f 7747: 41 0f af d8 imul %r8d,%ebx 774b: 4d 39 c1 cmp %r8,%r9 774e: 74 3f je 778f 7750: 41 0f af d0 imul %r8d,%edx 7754: 4d 39 c1 cmp %r8,%r9 7757: 74 36 je 778f 7759: 45 0f af d0 imul %r8d,%r10d 775d: 4d 39 c1 cmp %r8,%r9 7760: 74 2d je 778f 7762: 41 0f af f0 imul %r8d,%esi 7766: 4d 39 c1 cmp %r8,%r9 7769: 74 24 je 778f 776b: 41 0f af d8 imul %r8d,%ebx 776f: 4d 39 c1 cmp %r8,%r9 7772: 74 1b je 778f 7774: 41 0f af d0 imul %r8d,%edx 7778: 4d 39 c1 cmp %r8,%r9 777b: 74 12 je 778f 777d: 45 0f af f8 imul %r8d,%r15d 7781: 4d 39 c1 cmp %r8,%r9 7784: 74 09 je 778f 7786: 45 0f af f0 imul %r8d,%r14d 778a: 4c 29 cf sub %r9,%rdi 778d: 75 a1 jne 7730 000000000000778f : 778f: 41 5f pop %r15 7791: 41 5e pop %r14 7793: 41 5d pop %r13 7795: 41 5c pop %r12 7797: 41 5b pop %r11 7799: 41 5a pop %r10 779b: 41 59 pop %r9 779d: 41 58 pop %r8 779f: 5a pop %rdx 77a0: 59 pop %rcx 77a1: 5b pop %rbx 77a2: 5e pop %rsi 77a3: c3 retq 00000000000077a4 : 77a4: 56 push %rsi 77a5: 53 push %rbx 77a6: 51 push 
%rcx 77a7: 52 push %rdx 77a8: 41 50 push %r8 77aa: 41 51 push %r9 77ac: 41 52 push %r10 77ae: 41 53 push %r11 77b0: 41 54 push %r12 77b2: 41 55 push %r13 77b4: 41 56 push %r14 77b6: 41 57 push %r15 77b8: 49 c7 c0 01 00 00 00 mov $0x1,%r8 77bf: 49 c7 c1 28 00 00 00 mov $0x28,%r9 77c6: 48 31 db xor %rbx,%rbx 77c9: 48 31 c9 xor %rcx,%rcx 77cc: 4d 31 d2 xor %r10,%r10 77cf: 4d 31 db xor %r11,%r11 77d2: 4d 31 e4 xor %r12,%r12 77d5: 4d 31 ed xor %r13,%r13 77d8: 4d 31 f6 xor %r14,%r14 77db: 4d 31 ff xor %r15,%r15 77de: 48 31 f6 xor %rsi,%rsi 77e1: 4d 89 c2 mov %r8,%r10 77e4: 4d 89 c3 mov %r8,%r11 77e7: 4c 89 c6 mov %r8,%rsi 77ea: 4c 89 c0 mov %r8,%rax 77ed: 4c 89 c2 mov %r8,%rdx 00000000000077f0 : 77f0: 4d 01 c7 add %r8,%r15 77f3: 4d 01 c6 add %r8,%r14 77f6: 4d 01 c5 add %r8,%r13 77f9: 4d 01 c4 add %r8,%r12 77fc: 4d 0f af d0 imul %r8,%r10 7800: 4d 01 c7 add %r8,%r15 7803: 4d 01 c6 add %r8,%r14 7806: 4d 01 c5 add %r8,%r13 7809: 4d 01 c4 add %r8,%r12 780c: 49 0f af f0 imul %r8,%rsi 7810: 4d 01 c7 add %r8,%r15 7813: 4d 01 c6 add %r8,%r14 7816: 4d 01 c5 add %r8,%r13 7819: 4d 01 c4 add %r8,%r12 781c: 49 0f af d8 imul %r8,%rbx 7820: 4d 01 c7 add %r8,%r15 7823: 4d 01 c7 add %r8,%r15 7826: 4d 01 c5 add %r8,%r13 7829: 4d 01 c4 add %r8,%r12 782c: 49 0f af d0 imul %r8,%rdx 7830: 4d 01 c7 add %r8,%r15 7833: 4d 01 c6 add %r8,%r14 7836: 4d 01 c5 add %r8,%r13 7839: 4d 01 c4 add %r8,%r12 783c: 4d 0f af d0 imul %r8,%r10 7840: 4d 01 c7 add %r8,%r15 7843: 4d 01 c6 add %r8,%r14 7846: 4d 01 c5 add %r8,%r13 7849: 4d 01 c4 add %r8,%r12 784c: 49 0f af f0 imul %r8,%rsi 7850: 4d 01 c7 add %r8,%r15 7853: 4d 01 c6 add %r8,%r14 7856: 4d 01 c5 add %r8,%r13 7859: 4d 01 c4 add %r8,%r12 785c: 49 0f af d8 imul %r8,%rbx 7860: 4d 01 c7 add %r8,%r15 7863: 4d 01 c5 add %r8,%r13 7866: 4d 01 c4 add %r8,%r12 7869: 49 0f af d0 imul %r8,%rdx 786d: 4c 29 cf sub %r9,%rdi 7870: 0f 85 7a ff ff ff jne 77f0 7876: 41 5f pop %r15 7878: 41 5e pop %r14 787a: 41 5d pop %r13 787c: 41 5c pop %r12 787e: 41 5b pop %r11 7880: 41 
5a pop %r10 7882: 41 59 pop %r9 7884: 41 58 pop %r8 7886: 5a pop %rdx 7887: 59 pop %rcx 7888: 5b pop %rbx 7889: 5e pop %rsi 788a: c3 retq 000000000000788b : 788b: 41 51 push %r9 788d: 41 50 push %r8 788f: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7896: 66 49 0f 6e c9 movq %r9,%xmm1 789b: c5 fe 6f c8 vmovdqu %ymm0,%ymm1 789f: c5 fe 6f d0 vmovdqu %ymm0,%ymm2 78a3: c5 fe 6f d8 vmovdqu %ymm0,%ymm3 78a7: c5 fe 6f e0 vmovdqu %ymm0,%ymm4 78ab: c5 fe 6f e8 vmovdqu %ymm0,%ymm5 00000000000078af : 78af: c5 f5 d4 c8 vpaddq %ymm0,%ymm1,%ymm1 78b3: c5 ed d4 d0 vpaddq %ymm0,%ymm2,%ymm2 78b7: c5 e5 d4 d8 vpaddq %ymm0,%ymm3,%ymm3 78bb: c5 dd d4 e0 vpaddq %ymm0,%ymm4,%ymm4 78bf: c5 d5 d4 e8 vpaddq %ymm0,%ymm5,%ymm5 78c3: c5 f5 d4 c8 vpaddq %ymm0,%ymm1,%ymm1 78c7: c5 ed d4 d0 vpaddq %ymm0,%ymm2,%ymm2 78cb: c5 e5 d4 d8 vpaddq %ymm0,%ymm3,%ymm3 78cf: c5 dd d4 e0 vpaddq %ymm0,%ymm4,%ymm4 78d3: c5 d5 d4 e8 vpaddq %ymm0,%ymm5,%ymm5 78d7: c5 f5 d4 c8 vpaddq %ymm0,%ymm1,%ymm1 78db: c5 ed d4 d0 vpaddq %ymm0,%ymm2,%ymm2 78df: c5 e5 d4 d8 vpaddq %ymm0,%ymm3,%ymm3 78e3: c5 dd d4 e0 vpaddq %ymm0,%ymm4,%ymm4 78e7: c5 d5 d4 e8 vpaddq %ymm0,%ymm5,%ymm5 78eb: c5 f5 d4 c8 vpaddq %ymm0,%ymm1,%ymm1 78ef: c5 ed d4 d0 vpaddq %ymm0,%ymm2,%ymm2 78f3: c5 e5 d4 d8 vpaddq %ymm0,%ymm3,%ymm3 78f7: c5 dd d4 e0 vpaddq %ymm0,%ymm4,%ymm4 78fb: c5 d5 d4 e8 vpaddq %ymm0,%ymm5,%ymm5 78ff: 4c 29 cf sub %r9,%rdi 7902: 75 ab jne 78af 7904: 66 48 0f 7e c8 movq %xmm1,%rax 7909: c5 f8 77 vzeroupper 790c: 41 58 pop %r8 790e: 41 59 pop %r9 7910: c3 retq 0000000000007911 : 7911: 41 51 push %r9 7913: 41 50 push %r8 7915: 49 c7 c1 14 00 00 00 mov $0x14,%r9 791c: 66 49 0f 6e c9 movq %r9,%xmm1 7921: 62 f2 fd 48 59 c1 vpbroadcastq %xmm1,%zmm0 7927: 62 f1 fe 48 6f c8 vmovdqu64 %zmm0,%zmm1 792d: 62 f1 fe 48 6f d0 vmovdqu64 %zmm0,%zmm2 7933: 62 f1 fe 48 6f d8 vmovdqu64 %zmm0,%zmm3 7939: 62 f1 fe 48 6f e0 vmovdqu64 %zmm0,%zmm4 793f: 62 f1 fe 48 6f e8 vmovdqu64 %zmm0,%zmm5 0000000000007945 : 7945: 62 f2 75 48 40 c8 vpmulld %zmm0,%zmm1,%zmm1 
794b: 62 f2 6d 48 40 d0 vpmulld %zmm0,%zmm2,%zmm2 7951: 62 f2 65 48 40 d8 vpmulld %zmm0,%zmm3,%zmm3 7957: 62 f2 5d 48 40 e0 vpmulld %zmm0,%zmm4,%zmm4 795d: 62 f2 55 48 40 e8 vpmulld %zmm0,%zmm5,%zmm5 7963: 62 f2 75 48 40 c8 vpmulld %zmm0,%zmm1,%zmm1 7969: 62 f2 6d 48 40 d0 vpmulld %zmm0,%zmm2,%zmm2 796f: 62 f2 65 48 40 d8 vpmulld %zmm0,%zmm3,%zmm3 7975: 62 f2 5d 48 40 e0 vpmulld %zmm0,%zmm4,%zmm4 797b: 62 f2 55 48 40 e8 vpmulld %zmm0,%zmm5,%zmm5 7981: 62 f2 75 48 40 c8 vpmulld %zmm0,%zmm1,%zmm1 7987: 62 f2 6d 48 40 d0 vpmulld %zmm0,%zmm2,%zmm2 798d: 62 f2 65 48 40 d8 vpmulld %zmm0,%zmm3,%zmm3 7993: 62 f2 5d 48 40 e0 vpmulld %zmm0,%zmm4,%zmm4 7999: 62 f2 55 48 40 e8 vpmulld %zmm0,%zmm5,%zmm5 799f: 62 f2 75 48 40 c8 vpmulld %zmm0,%zmm1,%zmm1 79a5: 62 f2 6d 48 40 d0 vpmulld %zmm0,%zmm2,%zmm2 79ab: 62 f2 65 48 40 d8 vpmulld %zmm0,%zmm3,%zmm3 79b1: 62 f2 5d 48 40 e0 vpmulld %zmm0,%zmm4,%zmm4 79b7: 62 f2 55 48 40 e8 vpmulld %zmm0,%zmm5,%zmm5 79bd: 4c 29 cf sub %r9,%rdi 79c0: 75 83 jne 7945 79c2: 66 48 0f 7e c8 movq %xmm1,%rax 79c7: c5 f8 77 vzeroupper 79ca: 41 58 pop %r8 79cc: 41 59 pop %r9 79ce: c3 retq 00000000000079cf : 79cf: 41 51 push %r9 79d1: 41 50 push %r8 79d3: 49 c7 c1 14 00 00 00 mov $0x14,%r9 79da: 66 49 0f 6e c9 movq %r9,%xmm1 79df: 62 f2 fd 48 59 c1 vpbroadcastq %xmm1,%zmm0 79e5: 62 f1 fe 48 6f c8 vmovdqu64 %zmm0,%zmm1 79eb: 62 f1 fe 48 6f d0 vmovdqu64 %zmm0,%zmm2 79f1: 62 f1 fe 48 6f d8 vmovdqu64 %zmm0,%zmm3 79f7: 62 f1 fe 48 6f e0 vmovdqu64 %zmm0,%zmm4 79fd: 62 f1 fe 48 6f e8 vmovdqu64 %zmm0,%zmm5 0000000000007a03 : 7a03: 62 f2 f5 48 28 c8 vpmuldq %zmm0,%zmm1,%zmm1 7a09: 62 f2 ed 48 28 d0 vpmuldq %zmm0,%zmm2,%zmm2 7a0f: 62 f2 e5 48 28 d8 vpmuldq %zmm0,%zmm3,%zmm3 7a15: 62 f2 dd 48 28 e0 vpmuldq %zmm0,%zmm4,%zmm4 7a1b: 62 f2 d5 48 28 e8 vpmuldq %zmm0,%zmm5,%zmm5 7a21: 62 f2 f5 48 28 c8 vpmuldq %zmm0,%zmm1,%zmm1 7a27: 62 f2 ed 48 28 d0 vpmuldq %zmm0,%zmm2,%zmm2 7a2d: 62 f2 e5 48 28 d8 vpmuldq %zmm0,%zmm3,%zmm3 7a33: 62 f2 dd 48 28 e0 vpmuldq 
%zmm0,%zmm4,%zmm4 7a39: 62 f2 d5 48 28 e8 vpmuldq %zmm0,%zmm5,%zmm5 7a3f: 62 f2 f5 48 28 c8 vpmuldq %zmm0,%zmm1,%zmm1 7a45: 62 f2 ed 48 28 d0 vpmuldq %zmm0,%zmm2,%zmm2 7a4b: 62 f2 e5 48 28 d8 vpmuldq %zmm0,%zmm3,%zmm3 7a51: 62 f2 dd 48 28 e0 vpmuldq %zmm0,%zmm4,%zmm4 7a57: 62 f2 d5 48 28 e8 vpmuldq %zmm0,%zmm5,%zmm5 7a5d: 62 f2 f5 48 28 c8 vpmuldq %zmm0,%zmm1,%zmm1 7a63: 62 f2 ed 48 28 d0 vpmuldq %zmm0,%zmm2,%zmm2 7a69: 62 f2 e5 48 28 d8 vpmuldq %zmm0,%zmm3,%zmm3 7a6f: 62 f2 dd 48 28 e0 vpmuldq %zmm0,%zmm4,%zmm4 7a75: 62 f2 d5 48 28 e8 vpmuldq %zmm0,%zmm5,%zmm5 7a7b: 4c 29 cf sub %r9,%rdi 7a7e: 75 83 jne 7a03 7a80: 66 48 0f 7e c8 movq %xmm1,%rax 7a85: c5 f8 77 vzeroupper 7a88: 41 58 pop %r8 7a8a: 41 59 pop %r9 7a8c: c3 retq 0000000000007a8d : 7a8d: 41 51 push %r9 7a8f: 41 50 push %r8 7a91: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7a98: 66 49 0f 6e c9 movq %r9,%xmm1 7a9d: 62 f2 fd 48 59 c1 vpbroadcastq %xmm1,%zmm0 7aa3: 62 f1 fe 48 6f c8 vmovdqu64 %zmm0,%zmm1 7aa9: 62 f1 fe 48 6f d0 vmovdqu64 %zmm0,%zmm2 7aaf: 62 f1 fe 48 6f d8 vmovdqu64 %zmm0,%zmm3 7ab5: 62 f1 fe 48 6f e0 vmovdqu64 %zmm0,%zmm4 7abb: 62 f1 fe 48 6f e8 vmovdqu64 %zmm0,%zmm5 0000000000007ac1 : 7ac1: 62 f1 f5 48 d4 c8 vpaddq %zmm0,%zmm1,%zmm1 7ac7: 62 f1 ed 48 d4 d0 vpaddq %zmm0,%zmm2,%zmm2 7acd: 62 f1 e5 48 d4 d8 vpaddq %zmm0,%zmm3,%zmm3 7ad3: 62 f1 dd 48 d4 e0 vpaddq %zmm0,%zmm4,%zmm4 7ad9: 62 f1 d5 48 d4 e8 vpaddq %zmm0,%zmm5,%zmm5 7adf: 62 f1 f5 48 d4 c8 vpaddq %zmm0,%zmm1,%zmm1 7ae5: 62 f1 ed 48 d4 d0 vpaddq %zmm0,%zmm2,%zmm2 7aeb: 62 f1 e5 48 d4 d8 vpaddq %zmm0,%zmm3,%zmm3 7af1: 62 f1 dd 48 d4 e0 vpaddq %zmm0,%zmm4,%zmm4 7af7: 62 f1 d5 48 d4 e8 vpaddq %zmm0,%zmm5,%zmm5 7afd: 62 f1 f5 48 d4 c8 vpaddq %zmm0,%zmm1,%zmm1 7b03: 62 f1 ed 48 d4 d0 vpaddq %zmm0,%zmm2,%zmm2 7b09: 62 f1 e5 48 d4 d8 vpaddq %zmm0,%zmm3,%zmm3 7b0f: 62 f1 dd 48 d4 e0 vpaddq %zmm0,%zmm4,%zmm4 7b15: 62 f1 d5 48 d4 e8 vpaddq %zmm0,%zmm5,%zmm5 7b1b: 62 f1 f5 48 d4 c8 vpaddq %zmm0,%zmm1,%zmm1 7b21: 62 f1 ed 48 d4 d0 vpaddq 
%zmm0,%zmm2,%zmm2 7b27: 62 f1 e5 48 d4 d8 vpaddq %zmm0,%zmm3,%zmm3 7b2d: 62 f1 dd 48 d4 e0 vpaddq %zmm0,%zmm4,%zmm4 7b33: 62 f1 d5 48 d4 e8 vpaddq %zmm0,%zmm5,%zmm5 7b39: 4c 29 cf sub %r9,%rdi 7b3c: 75 83 jne 7ac1 7b3e: 66 48 0f 7e c8 movq %xmm1,%rax 7b43: c5 f8 77 vzeroupper 7b46: 41 58 pop %r8 7b48: 41 59 pop %r9 7b4a: c3 retq 0000000000007b4b : 7b4b: 41 51 push %r9 7b4d: 41 50 push %r8 7b4f: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7b56: 66 49 0f 6e c9 movq %r9,%xmm1 7b5b: c4 e2 7d 59 c1 vpbroadcastq %xmm1,%ymm0 7b60: c5 fe 6f c8 vmovdqu %ymm0,%ymm1 7b64: c5 fe 6f d0 vmovdqu %ymm0,%ymm2 7b68: c5 fe 6f d8 vmovdqu %ymm0,%ymm3 7b6c: c5 fe 6f e0 vmovdqu %ymm0,%ymm4 7b70: c5 fe 6f e8 vmovdqu %ymm0,%ymm5 7b74: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 7b79: c4 e2 7d 18 f6 vbroadcastss %xmm6,%ymm6 7b7e: c5 fc 10 fe vmovups %ymm6,%ymm7 7b82: c5 7c 10 c6 vmovups %ymm6,%ymm8 7b86: c5 7c 10 ce vmovups %ymm6,%ymm9 7b8a: c5 7c 10 d6 vmovups %ymm6,%ymm10 7b8e: c5 7c 10 de vmovups %ymm6,%ymm11 0000000000007b92 : 7b92: c5 f5 d4 c8 vpaddq %ymm0,%ymm1,%ymm1 7b96: c5 c4 58 fe vaddps %ymm6,%ymm7,%ymm7 7b9a: c5 ed d4 d0 vpaddq %ymm0,%ymm2,%ymm2 7b9e: c5 3c 58 c6 vaddps %ymm6,%ymm8,%ymm8 7ba2: c5 e5 d4 d8 vpaddq %ymm0,%ymm3,%ymm3 7ba6: c5 34 58 ce vaddps %ymm6,%ymm9,%ymm9 7baa: c5 dd d4 e0 vpaddq %ymm0,%ymm4,%ymm4 7bae: c5 2c 58 d6 vaddps %ymm6,%ymm10,%ymm10 7bb2: c5 d5 d4 e8 vpaddq %ymm0,%ymm5,%ymm5 7bb6: c5 24 58 de vaddps %ymm6,%ymm11,%ymm11 7bba: c5 f5 d4 c8 vpaddq %ymm0,%ymm1,%ymm1 7bbe: c5 c4 58 fe vaddps %ymm6,%ymm7,%ymm7 7bc2: c5 ed d4 d0 vpaddq %ymm0,%ymm2,%ymm2 7bc6: c5 3c 58 c6 vaddps %ymm6,%ymm8,%ymm8 7bca: c5 e5 d4 d8 vpaddq %ymm0,%ymm3,%ymm3 7bce: c5 34 58 ce vaddps %ymm6,%ymm9,%ymm9 7bd2: c5 dd d4 e0 vpaddq %ymm0,%ymm4,%ymm4 7bd6: c5 2c 58 d6 vaddps %ymm6,%ymm10,%ymm10 7bda: c5 d5 d4 e8 vpaddq %ymm0,%ymm5,%ymm5 7bde: c5 24 58 de vaddps %ymm6,%ymm11,%ymm11 7be2: 4c 29 cf sub %r9,%rdi 7be5: 75 ab jne 7b92 7be7: 66 48 0f 7e c8 movq %xmm1,%rax 7bec: c5 f8 77 vzeroupper 7bef: 41 58 pop 
%r8 7bf1: 41 59 pop %r9 7bf3: c3 retq 0000000000007bf4 : 7bf4: 41 51 push %r9 7bf6: 41 50 push %r8 7bf8: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7bff: 66 49 0f 6e c9 movq %r9,%xmm1 7c04: c4 62 7d 59 c1 vpbroadcastq %xmm1,%ymm8 7c09: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 7c0e: c4 e2 7d 18 f6 vbroadcastss %xmm6,%ymm6 7c13: c5 fc 10 fe vmovups %ymm6,%ymm7 7c17: c5 7c 10 ce vmovups %ymm6,%ymm9 7c1b: c5 7c 10 de vmovups %ymm6,%ymm11 7c1f: c5 7c 10 ee vmovups %ymm6,%ymm13 7c23: c5 7c 10 fe vmovups %ymm6,%ymm15 7c27: c4 41 7e 6f d0 vmovdqu %ymm8,%ymm10 7c2c: c4 41 7e 6f e0 vmovdqu %ymm8,%ymm12 7c31: c4 41 7e 6f f0 vmovdqu %ymm8,%ymm14 0000000000007c36 : 7c36: c5 c4 58 fe vaddps %ymm6,%ymm7,%ymm7 7c3a: c4 41 3d fe c0 vpaddd %ymm8,%ymm8,%ymm8 7c3f: c5 34 58 ce vaddps %ymm6,%ymm9,%ymm9 7c43: c4 41 2d fe d2 vpaddd %ymm10,%ymm10,%ymm10 7c48: c5 24 58 de vaddps %ymm6,%ymm11,%ymm11 7c4c: c4 41 1d fe e4 vpaddd %ymm12,%ymm12,%ymm12 7c51: c5 14 58 ee vaddps %ymm6,%ymm13,%ymm13 7c55: c4 41 0d fe f6 vpaddd %ymm14,%ymm14,%ymm14 7c5a: c5 04 58 fe vaddps %ymm6,%ymm15,%ymm15 7c5e: c5 d5 fe ed vpaddd %ymm5,%ymm5,%ymm5 7c62: c5 c4 58 fe vaddps %ymm6,%ymm7,%ymm7 7c66: c4 41 3d fe c0 vpaddd %ymm8,%ymm8,%ymm8 7c6b: c5 34 58 ce vaddps %ymm6,%ymm9,%ymm9 7c6f: c4 41 2d fe d2 vpaddd %ymm10,%ymm10,%ymm10 7c74: c5 24 58 de vaddps %ymm6,%ymm11,%ymm11 7c78: c4 41 1d fe e4 vpaddd %ymm12,%ymm12,%ymm12 7c7d: c5 14 58 ee vaddps %ymm6,%ymm13,%ymm13 7c81: c4 41 0d fe f6 vpaddd %ymm14,%ymm14,%ymm14 7c86: c5 04 58 fe vaddps %ymm6,%ymm15,%ymm15 7c8a: c5 d5 fe ed vpaddd %ymm5,%ymm5,%ymm5 7c8e: 4c 29 cf sub %r9,%rdi 7c91: 75 a3 jne 7c36 7c93: 66 48 0f 7e c8 movq %xmm1,%rax 7c98: c5 f8 77 vzeroupper 7c9b: 41 58 pop %r8 7c9d: 41 59 pop %r9 7c9f: c3 retq 0000000000007ca0 : 7ca0: 41 51 push %r9 7ca2: 41 50 push %r8 7ca4: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7cab: 66 49 0f 6e c9 movq %r9,%xmm1 7cb0: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 7cb5: c4 e2 7d 18 f6 vbroadcastss %xmm6,%ymm6 7cba: c5 fc 10 ee vmovups %ymm6,%ymm5 7cbe: c5 fc 
10 fe vmovups %ymm6,%ymm7 7cc2: c5 7c 10 c6 vmovups %ymm6,%ymm8 7cc6: c5 7c 10 ce vmovups %ymm6,%ymm9 7cca: c5 7c 10 d6 vmovups %ymm6,%ymm10 7cce: c5 7c 10 de vmovups %ymm6,%ymm11 7cd2: c5 7c 10 e6 vmovups %ymm6,%ymm12 7cd6: c5 7c 10 ee vmovups %ymm6,%ymm13 7cda: c5 7c 10 f6 vmovups %ymm6,%ymm14 7cde: c5 7c 10 fe vmovups %ymm6,%ymm15 0000000000007ce2 : 7ce2: c5 c4 58 fe vaddps %ymm6,%ymm7,%ymm7 7ce6: c5 3c 59 c6 vmulps %ymm6,%ymm8,%ymm8 7cea: c5 34 58 ce vaddps %ymm6,%ymm9,%ymm9 7cee: c5 2c 59 d6 vmulps %ymm6,%ymm10,%ymm10 7cf2: c5 24 58 de vaddps %ymm6,%ymm11,%ymm11 7cf6: c5 1c 59 e6 vmulps %ymm6,%ymm12,%ymm12 7cfa: c5 14 58 ee vaddps %ymm6,%ymm13,%ymm13 7cfe: c5 0c 59 f6 vmulps %ymm6,%ymm14,%ymm14 7d02: c5 04 58 fe vaddps %ymm6,%ymm15,%ymm15 7d06: c5 d4 59 ee vmulps %ymm6,%ymm5,%ymm5 7d0a: c5 c4 58 fe vaddps %ymm6,%ymm7,%ymm7 7d0e: c5 3c 59 c6 vmulps %ymm6,%ymm8,%ymm8 7d12: c5 34 58 ce vaddps %ymm6,%ymm9,%ymm9 7d16: c5 2c 59 d6 vmulps %ymm6,%ymm10,%ymm10 7d1a: c5 24 58 de vaddps %ymm6,%ymm11,%ymm11 7d1e: c5 1c 59 e6 vmulps %ymm6,%ymm12,%ymm12 7d22: c5 14 58 ee vaddps %ymm6,%ymm13,%ymm13 7d26: c5 0c 59 f6 vmulps %ymm6,%ymm14,%ymm14 7d2a: c5 04 58 fe vaddps %ymm6,%ymm15,%ymm15 7d2e: c5 d4 59 ee vmulps %ymm6,%ymm5,%ymm5 7d32: 4c 29 cf sub %r9,%rdi 7d35: 75 ab jne 7ce2 7d37: 66 48 0f 7e c8 movq %xmm1,%rax 7d3c: c5 f8 77 vzeroupper 7d3f: 41 58 pop %r8 7d41: 41 59 pop %r9 7d43: c3 retq 0000000000007d44 : 7d44: 41 51 push %r9 7d46: 41 50 push %r8 7d48: 41 57 push %r15 7d4a: 41 56 push %r14 7d4c: 41 55 push %r13 7d4e: 41 54 push %r12 7d50: 41 53 push %r11 7d52: 49 c7 c1 1e 00 00 00 mov $0x1e,%r9 7d59: 66 49 0f 6e c9 movq %r9,%xmm1 7d5e: c4 e2 7d 59 c1 vpbroadcastq %xmm1,%ymm0 7d63: c5 fe 6f c8 vmovdqu %ymm0,%ymm1 7d67: c5 fe 6f d0 vmovdqu %ymm0,%ymm2 7d6b: c5 fe 6f d8 vmovdqu %ymm0,%ymm3 7d6f: c5 fe 6f e0 vmovdqu %ymm0,%ymm4 7d73: c5 fe 6f e8 vmovdqu %ymm0,%ymm5 7d77: 4d 89 cf mov %r9,%r15 7d7a: 4d 89 ce mov %r9,%r14 7d7d: 4d 89 cd mov %r9,%r13 7d80: 4d 89 cc mov 
%r9,%r12 7d83: 4d 89 cb mov %r9,%r11 7d86: 4d 89 c8 mov %r9,%r8 0000000000007d89 : 7d89: 4d 01 c3 add %r8,%r11 7d8c: 4d 01 c4 add %r8,%r12 7d8f: 4d 01 c5 add %r8,%r13 7d92: 4d 01 c6 add %r8,%r14 7d95: 4d 01 c7 add %r8,%r15 7d98: c5 f5 d4 c8 vpaddq %ymm0,%ymm1,%ymm1 7d9c: c5 ed d4 d0 vpaddq %ymm0,%ymm2,%ymm2 7da0: c5 e5 d4 d8 vpaddq %ymm0,%ymm3,%ymm3 7da4: c5 dd d4 e0 vpaddq %ymm0,%ymm4,%ymm4 7da8: c5 d5 d4 e8 vpaddq %ymm0,%ymm5,%ymm5 7dac: 4d 01 c3 add %r8,%r11 7daf: 4d 01 c4 add %r8,%r12 7db2: 4d 01 c5 add %r8,%r13 7db5: 4d 01 c6 add %r8,%r14 7db8: 4d 01 c7 add %r8,%r15 7dbb: 4d 01 c3 add %r8,%r11 7dbe: 4d 01 c4 add %r8,%r12 7dc1: 4d 01 c5 add %r8,%r13 7dc4: 4d 01 c6 add %r8,%r14 7dc7: 4d 01 c7 add %r8,%r15 7dca: c5 f5 d4 c8 vpaddq %ymm0,%ymm1,%ymm1 7dce: c5 ed d4 d0 vpaddq %ymm0,%ymm2,%ymm2 7dd2: c5 e5 d4 d8 vpaddq %ymm0,%ymm3,%ymm3 7dd6: c5 dd d4 e0 vpaddq %ymm0,%ymm4,%ymm4 7dda: c5 d5 d4 e8 vpaddq %ymm0,%ymm5,%ymm5 7dde: 4d 01 c3 add %r8,%r11 7de1: 4d 01 c4 add %r8,%r12 7de4: 4d 01 c5 add %r8,%r13 7de7: 4d 01 c6 add %r8,%r14 7dea: 4d 01 c7 add %r8,%r15 7ded: 4c 29 cf sub %r9,%rdi 7df0: 75 97 jne 7d89 7df2: 66 48 0f 7e c8 movq %xmm1,%rax 7df7: c5 f8 77 vzeroupper 7dfa: 41 5b pop %r11 7dfc: 41 5c pop %r12 7dfe: 41 5d pop %r13 7e00: 41 5e pop %r14 7e02: 41 5f pop %r15 7e04: 41 58 pop %r8 7e06: 41 59 pop %r9 7e08: c3 retq 0000000000007e09 : 7e09: 41 51 push %r9 7e0b: 41 50 push %r8 7e0d: 41 57 push %r15 7e0f: 41 56 push %r14 7e11: 41 55 push %r13 7e13: 41 54 push %r12 7e15: 41 53 push %r11 7e17: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7e1e: 66 49 0f 6e c9 movq %r9,%xmm1 7e23: c4 e2 7d 59 c1 vpbroadcastq %xmm1,%ymm0 7e28: c5 fe 6f c8 vmovdqu %ymm0,%ymm1 7e2c: c5 fe 6f d0 vmovdqu %ymm0,%ymm2 7e30: c5 fe 6f d8 vmovdqu %ymm0,%ymm3 7e34: c5 fe 6f e0 vmovdqu %ymm0,%ymm4 7e38: c5 fe 6f e8 vmovdqu %ymm0,%ymm5 7e3c: 4d 89 cf mov %r9,%r15 7e3f: 4d 89 ce mov %r9,%r14 7e42: 4d 89 cd mov %r9,%r13 7e45: 4d 89 cc mov %r9,%r12 7e48: 4d 89 cb mov %r9,%r11 7e4b: 4d 89 c8 mov %r9,%r8 
0000000000007e4e : 7e4e: 4d 01 c3 add %r8,%r11 7e51: 4d 01 c4 add %r8,%r12 7e54: 4d 01 c5 add %r8,%r13 7e57: 4d 01 c6 add %r8,%r14 7e5a: 4d 01 c7 add %r8,%r15 7e5d: c5 f5 d4 c8 vpaddq %ymm0,%ymm1,%ymm1 7e61: c5 ed d4 d0 vpaddq %ymm0,%ymm2,%ymm2 7e65: c5 e5 d4 d8 vpaddq %ymm0,%ymm3,%ymm3 7e69: c5 dd d4 e0 vpaddq %ymm0,%ymm4,%ymm4 7e6d: c5 d5 d4 e8 vpaddq %ymm0,%ymm5,%ymm5 7e71: 4d 01 c3 add %r8,%r11 7e74: 4d 01 c4 add %r8,%r12 7e77: 4d 01 c5 add %r8,%r13 7e7a: 4d 01 c6 add %r8,%r14 7e7d: 4d 01 c7 add %r8,%r15 7e80: c5 f5 d4 c8 vpaddq %ymm0,%ymm1,%ymm1 7e84: c5 ed d4 d0 vpaddq %ymm0,%ymm2,%ymm2 7e88: c5 e5 d4 d8 vpaddq %ymm0,%ymm3,%ymm3 7e8c: c5 dd d4 e0 vpaddq %ymm0,%ymm4,%ymm4 7e90: c5 d5 d4 e8 vpaddq %ymm0,%ymm5,%ymm5 7e94: 4c 29 cf sub %r9,%rdi 7e97: 75 b5 jne 7e4e 7e99: 66 48 0f 7e c8 movq %xmm1,%rax 7e9e: c5 f8 77 vzeroupper 7ea1: 41 5b pop %r11 7ea3: 41 5c pop %r12 7ea5: 41 5d pop %r13 7ea7: 41 5e pop %r14 7ea9: 41 5f pop %r15 7eab: 41 58 pop %r8 7ead: 41 59 pop %r9 7eaf: c3 retq 0000000000007eb0 : 7eb0: 41 51 push %r9 7eb2: 41 50 push %r8 7eb4: 41 57 push %r15 7eb6: 41 56 push %r14 7eb8: 41 55 push %r13 7eba: 41 54 push %r12 7ebc: 41 53 push %r11 7ebe: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7ec5: 66 49 0f 6e c9 movq %r9,%xmm1 7eca: c4 e2 7d 59 c1 vpbroadcastq %xmm1,%ymm0 7ecf: c5 fe 6f c8 vmovdqu %ymm0,%ymm1 7ed3: c5 fe 6f d0 vmovdqu %ymm0,%ymm2 7ed7: c5 fe 6f d8 vmovdqu %ymm0,%ymm3 7edb: c5 fe 6f e0 vmovdqu %ymm0,%ymm4 7edf: c5 fe 6f e8 vmovdqu %ymm0,%ymm5 0000000000007ee3 : 7ee3: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7ee7: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7eeb: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7eef: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7ef3: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7ef7: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7efb: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7eff: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7f03: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7f07: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7f0b: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7f0f: c5 fd d4 c0 
vpaddq %ymm0,%ymm0,%ymm0 7f13: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7f17: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7f1b: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7f1f: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7f23: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7f27: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7f2b: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7f2f: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 7f33: 4c 29 cf sub %r9,%rdi 7f36: 75 ab jne 7ee3 7f38: 66 48 0f 7e c8 movq %xmm1,%rax 7f3d: c5 f8 77 vzeroupper 7f40: 41 5b pop %r11 7f42: 41 5c pop %r12 7f44: 41 5d pop %r13 7f46: 41 5e pop %r14 7f48: 41 5f pop %r15 7f4a: 41 58 pop %r8 7f4c: 41 59 pop %r9 7f4e: c3 retq 0000000000007f4f : 7f4f: 41 51 push %r9 7f51: 41 50 push %r8 7f53: 41 57 push %r15 7f55: 41 56 push %r14 7f57: 41 55 push %r13 7f59: 41 54 push %r12 7f5b: 41 53 push %r11 7f5d: 49 c7 c1 14 00 00 00 mov $0x14,%r9 7f64: 66 49 0f 6e c9 movq %r9,%xmm1 7f69: 62 f2 fd 48 59 c1 vpbroadcastq %xmm1,%zmm0 7f6f: 62 f1 fd 48 6f c8 vmovdqa64 %zmm0,%zmm1 7f75: 62 f1 fd 48 6f d0 vmovdqa64 %zmm0,%zmm2 7f7b: 62 f1 fd 48 6f d8 vmovdqa64 %zmm0,%zmm3 7f81: 62 f1 fd 48 6f e0 vmovdqa64 %zmm0,%zmm4 7f87: 62 f1 fd 48 6f e8 vmovdqa64 %zmm0,%zmm5 0000000000007f8d : 7f8d: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7f93: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7f99: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7f9f: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fa5: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fab: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fb1: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fb7: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fbd: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fc3: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fc9: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fcf: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fd5: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fdb: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fe1: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fe7: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fed: 62 f1 fd 48 d4 c0 
vpaddq %zmm0,%zmm0,%zmm0 7ff3: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7ff9: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 7fff: 62 f1 fd 48 d4 c0 vpaddq %zmm0,%zmm0,%zmm0 8005: 4c 29 cf sub %r9,%rdi 8008: 0f 85 d5 fe ff ff jne 7ee3 800e: 66 48 0f 7e c8 movq %xmm1,%rax 8013: c5 f8 77 vzeroupper 8016: 41 5b pop %r11 8018: 41 5c pop %r12 801a: 41 5d pop %r13 801c: 41 5e pop %r14 801e: 41 5f pop %r15 8020: 41 58 pop %r8 8022: 41 59 pop %r9 8024: c3 retq 0000000000008025 : 8025: 41 51 push %r9 8027: 41 50 push %r8 8029: 41 57 push %r15 802b: 41 56 push %r14 802d: 41 55 push %r13 802f: 41 54 push %r12 8031: 41 53 push %r11 8033: 49 c7 c1 14 00 00 00 mov $0x14,%r9 803a: 66 49 0f 6e c9 movq %r9,%xmm1 803f: 62 f2 7d 48 58 c1 vpbroadcastd %xmm1,%zmm0 8045: 62 f1 fe 48 6f c8 vmovdqu64 %zmm0,%zmm1 804b: 62 f1 fe 48 6f d0 vmovdqu64 %zmm0,%zmm2 8051: 62 f1 fe 48 6f d8 vmovdqu64 %zmm0,%zmm3 8057: 62 f1 fe 48 6f e0 vmovdqu64 %zmm0,%zmm4 805d: 62 f1 fe 48 6f e8 vmovdqu64 %zmm0,%zmm5 0000000000008063 : 8063: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 8069: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 806f: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 8075: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 807b: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 8081: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 8087: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 808d: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 8093: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 8099: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 809f: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 80a5: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 80ab: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 80b1: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 80b7: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 80bd: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 80c3: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 80c9: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 80cf: 62 f2 7d 48 40 c0 vpmulld %zmm0,%zmm0,%zmm0 80d5: 62 f2 7d 48 40 c0 vpmulld 
%zmm0,%zmm0,%zmm0 80db: 4c 29 cf sub %r9,%rdi 80de: 75 83 jne 8063 80e0: 66 48 0f 7e c8 movq %xmm1,%rax 80e5: c5 f8 77 vzeroupper 80e8: 41 5b pop %r11 80ea: 41 5c pop %r12 80ec: 41 5d pop %r13 80ee: 41 5e pop %r14 80f0: 41 5f pop %r15 80f2: 41 58 pop %r8 80f4: 41 59 pop %r9 80f6: c3 retq 00000000000080f7 : 80f7: 41 51 push %r9 80f9: 41 50 push %r8 80fb: 41 57 push %r15 80fd: 41 56 push %r14 80ff: 41 55 push %r13 8101: 41 54 push %r12 8103: 41 53 push %r11 8105: 49 c7 c1 14 00 00 00 mov $0x14,%r9 810c: 66 49 0f 6e c9 movq %r9,%xmm1 8111: 62 f2 7d 48 58 c1 vpbroadcastd %xmm1,%zmm0 8117: 62 f1 fe 48 6f c8 vmovdqu64 %zmm0,%zmm1 811d: 62 f1 fe 48 6f d0 vmovdqu64 %zmm0,%zmm2 8123: 62 f1 fe 48 6f d8 vmovdqu64 %zmm0,%zmm3 8129: 62 f1 fe 48 6f e0 vmovdqu64 %zmm0,%zmm4 812f: 62 f1 fe 48 6f e8 vmovdqu64 %zmm0,%zmm5 0000000000008135 : 8135: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 813b: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 8141: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 8147: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 814d: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 8153: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 8159: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 815f: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 8165: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 816b: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 8171: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 8177: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 817d: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 8183: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 8189: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 818f: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 8195: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 819b: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 81a1: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 81a7: 62 f2 fd 48 28 c0 vpmuldq %zmm0,%zmm0,%zmm0 81ad: 4c 29 cf sub %r9,%rdi 81b0: 75 83 jne 8135 81b2: 66 48 0f 7e c8 movq %xmm1,%rax 81b7: c5 f8 77 vzeroupper 81ba: 41 5b pop %r11 81bc: 41 5c pop %r12 81be: 41 
5d pop %r13 81c0: 41 5e pop %r14 81c2: 41 5f pop %r15 81c4: 41 58 pop %r8 81c6: 41 59 pop %r9 81c8: c3 retq 00000000000081c9 : 81c9: 41 51 push %r9 81cb: 41 50 push %r8 81cd: 41 57 push %r15 81cf: 41 56 push %r14 81d1: 41 55 push %r13 81d3: 41 54 push %r12 81d5: 41 53 push %r11 81d7: 49 c7 c1 14 00 00 00 mov $0x14,%r9 81de: 66 49 0f 6e c9 movq %r9,%xmm1 81e3: 62 f2 7d 48 58 c1 vpbroadcastd %xmm1,%zmm0 81e9: 62 f1 fe 48 6f c8 vmovdqu64 %zmm0,%zmm1 81ef: 62 f1 fe 48 6f d0 vmovdqu64 %zmm0,%zmm2 81f5: 62 f1 fe 48 6f d8 vmovdqu64 %zmm0,%zmm3 81fb: 62 f1 fe 48 6f e0 vmovdqu64 %zmm0,%zmm4 8201: 62 f1 fe 48 6f e8 vmovdqu64 %zmm0,%zmm5 0000000000008207 : 8207: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 820d: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 8213: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 8219: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 821f: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 8225: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 822b: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 8231: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 8237: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 823d: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 8243: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 8249: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 824f: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 8255: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 825b: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 8261: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 8267: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 826d: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 8273: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 8279: 62 f2 fd 48 40 c0 vpmullq %zmm0,%zmm0,%zmm0 827f: 4c 29 cf sub %r9,%rdi 8282: 75 83 jne 8207 8284: 66 48 0f 7e c8 movq %xmm1,%rax 8289: c5 f8 77 vzeroupper 828c: 41 5b pop %r11 828e: 41 5c pop %r12 8290: 41 5d pop %r13 8292: 41 5e pop %r14 8294: 41 5f pop %r15 8296: 41 58 pop %r8 8298: 41 59 pop %r9 829a: c3 retq 000000000000829b : 829b: 41 51 push %r9 829d: 41 50 push %r8 829f: 41 57 
push %r15 82a1: 41 56 push %r14 82a3: 41 55 push %r13 82a5: 41 54 push %r12 82a7: 41 53 push %r11 82a9: 49 c7 c1 14 00 00 00 mov $0x14,%r9 82b0: 66 49 0f 6e c9 movq %r9,%xmm1 82b5: c5 fe 6f c8 vmovdqu %ymm0,%ymm1 82b9: c5 fe 6f d0 vmovdqu %ymm0,%ymm2 82bd: c5 fe 6f d8 vmovdqu %ymm0,%ymm3 82c1: c5 fe 6f e0 vmovdqu %ymm0,%ymm4 82c5: c5 fe 6f e8 vmovdqu %ymm0,%ymm5 00000000000082c9 : 82c9: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 82ce: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 82d3: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 82d8: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 82dd: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 82e2: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 82e7: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 82ec: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 82f1: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 82f6: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 82fb: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 8300: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 8305: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 830a: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 830f: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 8314: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 8319: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 831e: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 8323: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 8328: c4 e2 7d 40 c0 vpmulld %ymm0,%ymm0,%ymm0 832d: 4c 29 cf sub %r9,%rdi 8330: 75 97 jne 82c9 8332: 66 48 0f 7e c8 movq %xmm1,%rax 8337: c5 f8 77 vzeroupper 833a: 41 5b pop %r11 833c: 41 5c pop %r12 833e: 41 5d pop %r13 8340: 41 5e pop %r14 8342: 41 5f pop %r15 8344: 41 58 pop %r8 8346: 41 59 pop %r9 8348: c3 retq 0000000000008349 : 8349: 41 51 push %r9 834b: 49 c7 c1 14 00 00 00 mov $0x14,%r9 8352: 66 49 0f 6e c9 movq %r9,%xmm1 0000000000008357 : 8357: 66 0f d4 c0 paddq %xmm0,%xmm0 835b: 66 0f d4 c0 paddq %xmm0,%xmm0 835f: 66 0f d4 c0 paddq %xmm0,%xmm0 8363: 66 0f d4 c0 paddq %xmm0,%xmm0 8367: 66 0f d4 c0 paddq %xmm0,%xmm0 836b: 66 0f d4 c0 paddq %xmm0,%xmm0 836f: 66 0f d4 c0 paddq %xmm0,%xmm0 8373: 66 0f d4 
c0 paddq %xmm0,%xmm0 8377: 66 0f d4 c0 paddq %xmm0,%xmm0 837b: 66 0f d4 c0 paddq %xmm0,%xmm0 837f: 66 0f d4 c0 paddq %xmm0,%xmm0 8383: 66 0f d4 c0 paddq %xmm0,%xmm0 8387: 66 0f d4 c0 paddq %xmm0,%xmm0 838b: 66 0f d4 c0 paddq %xmm0,%xmm0 838f: 66 0f d4 c0 paddq %xmm0,%xmm0 8393: 66 0f d4 c0 paddq %xmm0,%xmm0 8397: 66 0f d4 c0 paddq %xmm0,%xmm0 839b: 66 0f d4 c0 paddq %xmm0,%xmm0 839f: 66 0f d4 c0 paddq %xmm0,%xmm0 83a3: 66 0f d4 c0 paddq %xmm0,%xmm0 83a7: 4c 29 cf sub %r9,%rdi 83aa: 75 ab jne 8357 83ac: 66 48 0f 7e c8 movq %xmm1,%rax 83b1: 41 59 pop %r9 83b3: c3 retq 00000000000083b4 : 83b4: 41 51 push %r9 83b6: 49 c7 c1 14 00 00 00 mov $0x14,%r9 83bd: 66 49 0f 6e c9 movq %r9,%xmm1 00000000000083c2 : 83c2: 66 0f d4 c0 paddq %xmm0,%xmm0 83c6: 66 0f d4 c9 paddq %xmm1,%xmm1 83ca: 66 0f d4 d2 paddq %xmm2,%xmm2 83ce: 66 0f d4 db paddq %xmm3,%xmm3 83d2: 66 0f d4 e4 paddq %xmm4,%xmm4 83d6: 66 0f d4 c0 paddq %xmm0,%xmm0 83da: 66 0f d4 c9 paddq %xmm1,%xmm1 83de: 66 0f d4 d2 paddq %xmm2,%xmm2 83e2: 66 0f d4 db paddq %xmm3,%xmm3 83e6: 66 0f d4 e4 paddq %xmm4,%xmm4 83ea: 66 0f d4 c0 paddq %xmm0,%xmm0 83ee: 66 0f d4 c9 paddq %xmm1,%xmm1 83f2: 66 0f d4 d2 paddq %xmm2,%xmm2 83f6: 66 0f d4 db paddq %xmm3,%xmm3 83fa: 66 0f d4 e4 paddq %xmm4,%xmm4 83fe: 66 0f d4 c0 paddq %xmm0,%xmm0 8402: 66 0f d4 c9 paddq %xmm1,%xmm1 8406: 66 0f d4 d2 paddq %xmm2,%xmm2 840a: 66 0f d4 db paddq %xmm3,%xmm3 840e: 66 0f d4 e4 paddq %xmm4,%xmm4 8412: 4c 29 cf sub %r9,%rdi 8415: 75 ab jne 83c2 8417: 66 48 0f 7e c8 movq %xmm1,%rax 841c: 41 59 pop %r9 841e: c3 retq 000000000000841f : 841f: 41 51 push %r9 8421: 49 c7 c1 14 00 00 00 mov $0x14,%r9 8428: 66 49 0f 6e c9 movq %r9,%xmm1 842d: c5 fc 77 vzeroall 8430: 66 0f ef c0 pxor %xmm0,%xmm0 8434: 66 0f ef c9 pxor %xmm1,%xmm1 8438: 66 0f ef d2 pxor %xmm2,%xmm2 843c: 66 0f ef db pxor %xmm3,%xmm3 8440: 66 0f ef e4 pxor %xmm4,%xmm4 8444: 66 0f ef ed pxor %xmm5,%xmm5 0000000000008448 : 8448: 66 0f 38 dc c8 aesenc %xmm0,%xmm1 844d: 66 0f 38 dc d0 aesenc %xmm0,%xmm2 
8452: 66 0f 38 dc d8 aesenc %xmm0,%xmm3 8457: 66 0f 38 dc e0 aesenc %xmm0,%xmm4 845c: 66 0f 38 dc e8 aesenc %xmm0,%xmm5 8461: 66 0f 38 dc c8 aesenc %xmm0,%xmm1 8466: 66 0f 38 dc d0 aesenc %xmm0,%xmm2 846b: 66 0f 38 dc d8 aesenc %xmm0,%xmm3 8470: 66 0f 38 dc e0 aesenc %xmm0,%xmm4 8475: 66 0f 38 dc e8 aesenc %xmm0,%xmm5 847a: 66 0f 38 dc c8 aesenc %xmm0,%xmm1 847f: 66 0f 38 dc d0 aesenc %xmm0,%xmm2 8484: 66 0f 38 dc d8 aesenc %xmm0,%xmm3 8489: 66 0f 38 dc e0 aesenc %xmm0,%xmm4 848e: 66 0f 38 dc e8 aesenc %xmm0,%xmm5 8493: 66 0f 38 dc c8 aesenc %xmm0,%xmm1 8498: 66 0f 38 dc d0 aesenc %xmm0,%xmm2 849d: 66 0f 38 dc d8 aesenc %xmm0,%xmm3 84a2: 66 0f 38 dc e0 aesenc %xmm0,%xmm4 84a7: 66 0f 38 dc e8 aesenc %xmm0,%xmm5 84ac: 4c 29 cf sub %r9,%rdi 84af: 75 97 jne 8448 84b1: 66 48 0f 7e c8 movq %xmm1,%rax 84b6: 41 59 pop %r9 84b8: c3 retq 00000000000084b9 : 84b9: 41 51 push %r9 84bb: 49 c7 c1 14 00 00 00 mov $0x14,%r9 84c2: 66 49 0f 6e c9 movq %r9,%xmm1 84c7: c5 fc 77 vzeroall 84ca: 66 0f ef c0 pxor %xmm0,%xmm0 84ce: 66 0f ef c9 pxor %xmm1,%xmm1 84d2: 66 0f ef d2 pxor %xmm2,%xmm2 84d6: 66 0f ef db pxor %xmm3,%xmm3 84da: 66 0f ef e4 pxor %xmm4,%xmm4 84de: 66 0f ef ed pxor %xmm5,%xmm5 84e2: 66 0f ef f6 pxor %xmm6,%xmm6 84e6: 66 0f ef ff pxor %xmm7,%xmm7 84ea: 66 45 0f ef c0 pxor %xmm8,%xmm8 84ef: 66 45 0f ef c9 pxor %xmm9,%xmm9 84f4: 66 45 0f ef d2 pxor %xmm10,%xmm10 84f9: 66 45 0f ef db pxor %xmm11,%xmm11 84fe: 66 45 0f ef e4 pxor %xmm12,%xmm12 8503: 66 45 0f ef ed pxor %xmm13,%xmm13 0000000000008508 : 8508: 66 0f 38 dc c8 aesenc %xmm0,%xmm1 850d: 66 0f fe d6 paddd %xmm6,%xmm2 8511: 66 0f fe de paddd %xmm6,%xmm3 8515: 66 0f fe e6 paddd %xmm6,%xmm4 8519: 66 0f 38 dc e8 aesenc %xmm0,%xmm5 851e: 66 0f fe fe paddd %xmm6,%xmm7 8522: 66 44 0f fe c6 paddd %xmm6,%xmm8 8527: 66 44 0f fe ce paddd %xmm6,%xmm9 852c: 66 44 0f 38 dc d0 aesenc %xmm0,%xmm10 8532: 66 0f fe d6 paddd %xmm6,%xmm2 8536: 66 0f fe de paddd %xmm6,%xmm3 853a: 66 0f fe e6 paddd %xmm6,%xmm4 853e: 66 0f 38 dc c8 aesenc 
%xmm0,%xmm1 8543: 66 0f fe fe paddd %xmm6,%xmm7 8547: 66 44 0f fe c6 paddd %xmm6,%xmm8 854c: 66 44 0f fe ce paddd %xmm6,%xmm9 8551: 66 44 0f 38 dc d0 aesenc %xmm0,%xmm10 8557: 66 44 0f fe de paddd %xmm6,%xmm11 855c: 66 44 0f fe e6 paddd %xmm6,%xmm12 8561: 66 44 0f fe ee paddd %xmm6,%xmm13 8566: 4c 29 cf sub %r9,%rdi 8569: 75 9d jne 8508 856b: 66 48 0f 7e c8 movq %xmm1,%rax 8570: 41 59 pop %r9 8572: c3 retq 0000000000008573 : 8573: 41 51 push %r9 8575: 49 c7 c1 0f 00 00 00 mov $0xf,%r9 857c: 66 49 0f 6e c9 movq %r9,%xmm1 8581: c5 fc 77 vzeroall 8584: 66 0f ef c0 pxor %xmm0,%xmm0 8588: 66 0f ef c9 pxor %xmm1,%xmm1 858c: 0f 57 d2 xorps %xmm2,%xmm2 858f: 0f 57 db xorps %xmm3,%xmm3 8592: 0f 57 e4 xorps %xmm4,%xmm4 8595: 66 0f ef ed pxor %xmm5,%xmm5 8599: 0f 57 f6 xorps %xmm6,%xmm6 859c: 0f 57 ff xorps %xmm7,%xmm7 859f: 45 0f 57 c0 xorps %xmm8,%xmm8 85a3: 45 0f 57 c9 xorps %xmm9,%xmm9 85a7: 66 45 0f ef d2 pxor %xmm10,%xmm10 85ac: 45 0f 57 db xorps %xmm11,%xmm11 85b0: 45 0f 57 e4 xorps %xmm12,%xmm12 85b4: 45 0f 57 ed xorps %xmm13,%xmm13 85b8: 45 0f 57 f6 xorps %xmm14,%xmm14 85bc: 45 0f 57 ff xorps %xmm15,%xmm15 85c0: 62 a1 7c 00 57 c0 vxorps %xmm16,%xmm16,%xmm16 85c6: 62 a1 74 00 57 c9 vxorps %xmm17,%xmm17,%xmm17 85cc: 62 a1 6c 00 57 d2 vxorps %xmm18,%xmm18,%xmm18 85d2: 62 a1 64 00 57 db vxorps %xmm19,%xmm19,%xmm19 00000000000085d8 : 85d8: 66 0f 38 dc c8 aesenc %xmm0,%xmm1 85dd: c4 e2 69 98 d6 vfmadd132ps %xmm6,%xmm2,%xmm2 85e2: c4 e2 61 98 de vfmadd132ps %xmm6,%xmm3,%xmm3 85e7: 66 0f 38 dc e8 aesenc %xmm0,%xmm5 85ec: c4 e2 41 98 fe vfmadd132ps %xmm6,%xmm7,%xmm7 85f1: c4 62 39 98 c6 vfmadd132ps %xmm6,%xmm8,%xmm8 85f6: 66 44 0f 38 dc d0 aesenc %xmm0,%xmm10 85fc: c4 62 21 98 de vfmadd132ps %xmm6,%xmm11,%xmm11 8601: c4 62 19 98 e6 vfmadd132ps %xmm6,%xmm12,%xmm12 8606: 66 0f 38 dc c8 aesenc %xmm0,%xmm1 860b: c4 62 09 98 f6 vfmadd132ps %xmm6,%xmm14,%xmm14 8610: c4 62 01 98 fe vfmadd132ps %xmm6,%xmm15,%xmm15 8615: 66 44 0f 38 dc d0 aesenc %xmm0,%xmm10 861b: 62 e2 75 00 98 ce 
vfmadd132ps %xmm6,%xmm17,%xmm17 8621: 62 e2 6d 00 98 d6 vfmadd132ps %xmm6,%xmm18,%xmm18 8627: 4c 29 cf sub %r9,%rdi 862a: 75 ac jne 85d8 862c: 66 48 0f 7e c8 movq %xmm1,%rax 8631: 41 59 pop %r9 8633: c3 retq 0000000000008634 : 8634: 41 51 push %r9 8636: 49 c7 c1 0f 00 00 00 mov $0xf,%r9 863d: 66 49 0f 6e c9 movq %r9,%xmm1 8642: c5 fc 77 vzeroall 8645: 66 0f ef c0 pxor %xmm0,%xmm0 8649: 66 0f ef c9 pxor %xmm1,%xmm1 864d: 0f 57 d2 xorps %xmm2,%xmm2 8650: 0f 57 db xorps %xmm3,%xmm3 8653: 0f 57 e4 xorps %xmm4,%xmm4 8656: 66 0f ef ed pxor %xmm5,%xmm5 865a: 0f 57 f6 xorps %xmm6,%xmm6 865d: 0f 57 ff xorps %xmm7,%xmm7 8660: 45 0f 57 c0 xorps %xmm8,%xmm8 8664: 45 0f 57 c9 xorps %xmm9,%xmm9 8668: 66 45 0f ef d2 pxor %xmm10,%xmm10 866d: 45 0f 57 db xorps %xmm11,%xmm11 8671: 45 0f 57 e4 xorps %xmm12,%xmm12 8675: 45 0f 57 ed xorps %xmm13,%xmm13 8679: 45 0f 57 f6 xorps %xmm14,%xmm14 867d: 45 0f 57 ff xorps %xmm15,%xmm15 8681: 62 a1 7c 00 57 c0 vxorps %xmm16,%xmm16,%xmm16 8687: 62 a1 74 00 57 c9 vxorps %xmm17,%xmm17,%xmm17 868d: 62 a1 6c 00 57 d2 vxorps %xmm18,%xmm18,%xmm18 8693: 62 a1 64 00 57 db vxorps %xmm19,%xmm19,%xmm19 0000000000008699 : 8699: 66 0f 38 dc c8 aesenc %xmm0,%xmm1 869e: c5 e8 58 d6 vaddps %xmm6,%xmm2,%xmm2 86a2: c5 e0 58 de vaddps %xmm6,%xmm3,%xmm3 86a6: 66 0f 38 dc e8 aesenc %xmm0,%xmm5 86ab: c5 c0 58 fe vaddps %xmm6,%xmm7,%xmm7 86af: c5 38 58 c6 vaddps %xmm6,%xmm8,%xmm8 86b3: 66 44 0f 38 dc d0 aesenc %xmm0,%xmm10 86b9: c5 20 58 de vaddps %xmm6,%xmm11,%xmm11 86bd: c5 18 58 e6 vaddps %xmm6,%xmm12,%xmm12 86c1: 66 0f 38 dc c8 aesenc %xmm0,%xmm1 86c6: c5 08 58 f6 vaddps %xmm6,%xmm14,%xmm14 86ca: c5 00 58 fe vaddps %xmm6,%xmm15,%xmm15 86ce: 66 44 0f 38 dc d0 aesenc %xmm0,%xmm10 86d4: 62 e1 74 00 58 ce vaddps %xmm6,%xmm17,%xmm17 86da: 62 e1 6c 00 58 d6 vaddps %xmm6,%xmm18,%xmm18 86e0: 4c 29 cf sub %r9,%rdi 86e3: 7f b4 jg 8699 86e5: 66 48 0f 7e c8 movq %xmm1,%rax 86ea: 41 59 pop %r9 86ec: c3 retq 00000000000086ed : 86ed: 41 51 push %r9 86ef: 49 c7 c1 0f 00 00 00 mov 
$0xf,%r9 86f6: c5 fc 77 vzeroall 86f9: 66 49 0f 6e f1 movq %r9,%xmm6 86fe: 66 0f ef c0 pxor %xmm0,%xmm0 8702: 66 0f ef ed pxor %xmm5,%xmm5 8706: 66 45 0f ef d2 pxor %xmm10,%xmm10 870b: 0f 57 c9 xorps %xmm1,%xmm1 870e: 0f 57 d2 xorps %xmm2,%xmm2 8711: 0f 57 db xorps %xmm3,%xmm3 8714: 0f 57 e4 xorps %xmm4,%xmm4 8717: 0f 57 ff xorps %xmm7,%xmm7 871a: 45 0f 57 c0 xorps %xmm8,%xmm8 871e: 45 0f 57 db xorps %xmm11,%xmm11 8722: 45 0f 57 e4 xorps %xmm12,%xmm12 8726: 45 0f 57 f6 xorps %xmm14,%xmm14 872a: 45 0f 57 ff xorps %xmm15,%xmm15 000000000000872e : 872e: 66 0f 38 dc c8 aesenc %xmm0,%xmm1 8733: 66 0f d5 d6 pmullw %xmm6,%xmm2 8737: 66 0f d5 de pmullw %xmm6,%xmm3 873b: 66 0f 38 dc e8 aesenc %xmm0,%xmm5 8740: 66 0f d5 fe pmullw %xmm6,%xmm7 8744: 66 44 0f d5 c6 pmullw %xmm6,%xmm8 8749: 66 44 0f 38 dc d0 aesenc %xmm0,%xmm10 874f: 66 44 0f d5 de pmullw %xmm6,%xmm11 8754: 66 44 0f d5 e6 pmullw %xmm6,%xmm12 8759: 66 0f 38 dc c8 aesenc %xmm0,%xmm1 875e: 66 0f d5 e6 pmullw %xmm6,%xmm4 8762: 66 0f d5 f6 pmullw %xmm6,%xmm6 8766: 66 44 0f 38 dc d0 aesenc %xmm0,%xmm10 876c: 66 44 0f d5 ee pmullw %xmm6,%xmm13 8771: 66 44 0f d5 f6 pmullw %xmm6,%xmm14 8776: 4c 29 cf sub %r9,%rdi 8779: 7f b3 jg 872e 877b: 66 48 0f 7e c8 movq %xmm1,%rax 8780: 41 59 pop %r9 8782: c3 retq 0000000000008783 : 8783: 41 51 push %r9 8785: 49 c7 c1 14 00 00 00 mov $0x14,%r9 878c: 66 49 0f 6e c9 movq %r9,%xmm1 8791: c5 fc 77 vzeroall 8794: 66 0f ef c0 pxor %xmm0,%xmm0 8798: 66 0f ef c9 pxor %xmm1,%xmm1 879c: 66 0f ef d2 pxor %xmm2,%xmm2 87a0: 66 0f ef db pxor %xmm3,%xmm3 87a4: 66 0f ef e4 pxor %xmm4,%xmm4 87a8: 66 0f ef ed pxor %xmm5,%xmm5 00000000000087ac : 87ac: 66 0f 38 de c8 aesdec %xmm0,%xmm1 87b1: 66 0f 38 de d0 aesdec %xmm0,%xmm2 87b6: 66 0f 38 de d8 aesdec %xmm0,%xmm3 87bb: 66 0f 38 de e0 aesdec %xmm0,%xmm4 87c0: 66 0f 38 de e8 aesdec %xmm0,%xmm5 87c5: 66 0f 38 de c8 aesdec %xmm0,%xmm1 87ca: 66 0f 38 de d0 aesdec %xmm0,%xmm2 87cf: 66 0f 38 de d8 aesdec %xmm0,%xmm3 87d4: 66 0f 38 de e0 aesdec %xmm0,%xmm4 
87d9: 66 0f 38 de e8 aesdec %xmm0,%xmm5 87de: 66 0f 38 de c8 aesdec %xmm0,%xmm1 87e3: 66 0f 38 de d0 aesdec %xmm0,%xmm2 87e8: 66 0f 38 de d8 aesdec %xmm0,%xmm3 87ed: 66 0f 38 de e0 aesdec %xmm0,%xmm4 87f2: 66 0f 38 de e8 aesdec %xmm0,%xmm5 87f7: 66 0f 38 de c8 aesdec %xmm0,%xmm1 87fc: 66 0f 38 de d0 aesdec %xmm0,%xmm2 8801: 66 0f 38 de d8 aesdec %xmm0,%xmm3 8806: 66 0f 38 de e0 aesdec %xmm0,%xmm4 880b: 66 0f 38 de e8 aesdec %xmm0,%xmm5 8810: 4c 29 cf sub %r9,%rdi 8813: 75 97 jne 87ac 8815: 66 48 0f 7e c8 movq %xmm1,%rax 881a: 41 59 pop %r9 881c: c3 retq 000000000000881d : 881d: 41 51 push %r9 881f: 49 c7 c1 14 00 00 00 mov $0x14,%r9 8826: 66 49 0f 6e c9 movq %r9,%xmm1 000000000000882b : 882b: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 8830: 66 0f 38 40 c9 pmulld %xmm1,%xmm1 8835: 66 0f 38 40 d2 pmulld %xmm2,%xmm2 883a: 66 0f 38 40 db pmulld %xmm3,%xmm3 883f: 66 0f 38 40 e4 pmulld %xmm4,%xmm4 8844: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 8849: 66 0f 38 40 c9 pmulld %xmm1,%xmm1 884e: 66 0f 38 40 d2 pmulld %xmm2,%xmm2 8853: 66 0f 38 40 db pmulld %xmm3,%xmm3 8858: 66 0f 38 40 e4 pmulld %xmm4,%xmm4 885d: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 8862: 66 0f 38 40 c9 pmulld %xmm1,%xmm1 8867: 66 0f 38 40 d2 pmulld %xmm2,%xmm2 886c: 66 0f 38 40 db pmulld %xmm3,%xmm3 8871: 66 0f 38 40 e4 pmulld %xmm4,%xmm4 8876: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 887b: 66 0f 38 40 c9 pmulld %xmm1,%xmm1 8880: 66 0f 38 40 d2 pmulld %xmm2,%xmm2 8885: 66 0f 38 40 db pmulld %xmm3,%xmm3 888a: 66 0f 38 40 e4 pmulld %xmm4,%xmm4 888f: 4c 29 cf sub %r9,%rdi 8892: 75 97 jne 882b 8894: 66 48 0f 7e c8 movq %xmm1,%rax 8899: 41 59 pop %r9 889b: c3 retq 000000000000889c : 889c: 41 51 push %r9 889e: 49 c7 c1 14 00 00 00 mov $0x14,%r9 88a5: 66 49 0f 6e c9 movq %r9,%xmm1 00000000000088aa : 88aa: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88af: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88b4: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88b9: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88be: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88c3: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88c8: 
66 0f 38 40 c0 pmulld %xmm0,%xmm0 88cd: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88d2: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88d7: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88dc: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88e1: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88e6: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88eb: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88f0: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88f5: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88fa: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 88ff: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 8904: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 8909: 66 0f 38 40 c0 pmulld %xmm0,%xmm0 890e: 4c 29 cf sub %r9,%rdi 8911: 75 97 jne 88aa 8913: 66 48 0f 7e c8 movq %xmm1,%rax 8918: 41 59 pop %r9 891a: c3 retq 000000000000891b : 891b: 41 51 push %r9 891d: 49 c7 c1 14 00 00 00 mov $0x14,%r9 8924: 66 49 0f 6e c9 movq %r9,%xmm1 8929: 66 0f 6f c8 movdqa %xmm0,%xmm1 892d: 66 0f 6f d0 movdqa %xmm0,%xmm2 8931: 66 0f 6f d8 movdqa %xmm0,%xmm3 8935: 66 0f 6f e0 movdqa %xmm0,%xmm4 8939: 66 0f 6f e8 movdqa %xmm0,%xmm5 893d: 66 0f 6f f0 movdqa %xmm0,%xmm6 8941: 66 0f 6f f8 movdqa %xmm0,%xmm7 8945: 66 44 0f 6f c0 movdqa %xmm0,%xmm8 894a: 66 44 0f 6f c8 movdqa %xmm0,%xmm9 894f: 66 44 0f 6f d0 movdqa %xmm0,%xmm10 0000000000008954 : 8954: 66 0f 38 40 c8 pmulld %xmm0,%xmm1 8959: 66 0f fe d0 paddd %xmm0,%xmm2 895d: 66 0f 38 40 d8 pmulld %xmm0,%xmm3 8962: 66 0f fe e0 paddd %xmm0,%xmm4 8966: 66 0f 38 40 e8 pmulld %xmm0,%xmm5 896b: 66 0f fe f0 paddd %xmm0,%xmm6 896f: 66 0f 38 40 f8 pmulld %xmm0,%xmm7 8974: 66 44 0f fe c0 paddd %xmm0,%xmm8 8979: 66 44 0f 38 40 c8 pmulld %xmm0,%xmm9 897f: 66 44 0f fe d0 paddd %xmm0,%xmm10 8984: 66 0f 38 40 c8 pmulld %xmm0,%xmm1 8989: 66 0f fe d0 paddd %xmm0,%xmm2 898d: 66 0f 38 40 d8 pmulld %xmm0,%xmm3 8992: 66 0f fe e0 paddd %xmm0,%xmm4 8996: 66 0f 38 40 e8 pmulld %xmm0,%xmm5 899b: 66 0f fe f0 paddd %xmm0,%xmm6 899f: 66 0f 38 40 f8 pmulld %xmm0,%xmm7 89a4: 66 44 0f fe c0 paddd %xmm0,%xmm8 89a9: 66 44 0f 38 40 c8 pmulld %xmm0,%xmm9 89af: 66 44 0f fe d0 paddd %xmm0,%xmm10 89b4: 4c 29 cf sub %r9,%rdi 
89b7: 75 9b jne 8954 89b9: 66 48 0f 7e c8 movq %xmm1,%rax 89be: 41 59 pop %r9 89c0: c3 retq 00000000000089c1 : 89c1: 41 51 push %r9 89c3: 41 50 push %r8 89c5: 49 c7 c1 14 00 00 00 mov $0x14,%r9 89cc: 66 49 0f 6e c9 movq %r9,%xmm1 89d1: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 89d6: c4 e2 7d 18 f6 vbroadcastss %xmm6,%ymm6 00000000000089db : 89db: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 89df: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 89e3: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 89e7: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 89eb: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 89ef: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 89f3: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 89f7: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 89fb: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 89ff: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 8a03: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 8a07: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 8a0b: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 8a0f: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 8a13: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 8a17: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 8a1b: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 8a1f: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 8a23: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 8a27: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 8a2b: 4c 29 cf sub %r9,%rdi 8a2e: 75 ab jne 89db 8a30: 66 48 0f 7e c8 movq %xmm1,%rax 8a35: c5 f8 77 vzeroupper 8a38: 41 58 pop %r8 8a3a: 41 59 pop %r9 8a3c: c3 retq 0000000000008a3d : 8a3d: 41 51 push %r9 8a3f: 41 50 push %r8 8a41: 49 c7 c1 14 00 00 00 mov $0x14,%r9 8a48: f3 49 0f 2a c1 cvtsi2ss %r9,%xmm0 8a4d: c4 e2 7d 18 f0 vbroadcastss %xmm0,%ymm6 8a52: c5 fd 6f c8 vmovdqa %ymm0,%ymm1 8a56: c5 fd 6f d0 vmovdqa %ymm0,%ymm2 8a5a: c5 fd 6f d8 vmovdqa %ymm0,%ymm3 8a5e: c5 fd 6f e0 vmovdqa %ymm0,%ymm4 8a62: c5 fd 6f e8 vmovdqa %ymm0,%ymm5 8a66: c5 fd 6f f0 vmovdqa %ymm0,%ymm6 8a6a: c5 fd 6f f8 vmovdqa %ymm0,%ymm7 8a6e: c5 7d 6f c0 vmovdqa %ymm0,%ymm8 8a72: c5 7d 6f c8 vmovdqa %ymm0,%ymm9 8a76: c5 7d 6f d0 vmovdqa %ymm0,%ymm10 0000000000008a7a : 8a7a: c5 f4 59 c8 vmulps %ymm0,%ymm1,%ymm1 8a7e: 
c5 ec 59 d0 vmulps %ymm0,%ymm2,%ymm2 8a82: c5 e4 59 d8 vmulps %ymm0,%ymm3,%ymm3 8a86: c5 dc 59 e0 vmulps %ymm0,%ymm4,%ymm4 8a8a: c5 d4 59 e8 vmulps %ymm0,%ymm5,%ymm5 8a8e: c5 cc 59 f0 vmulps %ymm0,%ymm6,%ymm6 8a92: c5 c4 59 f8 vmulps %ymm0,%ymm7,%ymm7 8a96: c5 3c 59 c0 vmulps %ymm0,%ymm8,%ymm8 8a9a: c5 34 59 c8 vmulps %ymm0,%ymm9,%ymm9 8a9e: c5 2c 59 d0 vmulps %ymm0,%ymm10,%ymm10 8aa2: c5 f4 59 c8 vmulps %ymm0,%ymm1,%ymm1 8aa6: c5 ec 59 d0 vmulps %ymm0,%ymm2,%ymm2 8aaa: c5 e4 59 d8 vmulps %ymm0,%ymm3,%ymm3 8aae: c5 dc 59 e0 vmulps %ymm0,%ymm4,%ymm4 8ab2: c5 d4 59 e8 vmulps %ymm0,%ymm5,%ymm5 8ab6: c5 cc 59 f0 vmulps %ymm0,%ymm6,%ymm6 8aba: c5 c4 59 f8 vmulps %ymm0,%ymm7,%ymm7 8abe: c5 3c 59 c0 vmulps %ymm0,%ymm8,%ymm8 8ac2: c5 34 59 c8 vmulps %ymm0,%ymm9,%ymm9 8ac6: c5 2c 59 d0 vmulps %ymm0,%ymm10,%ymm10 8aca: 4c 29 cf sub %r9,%rdi 8acd: 75 ab jne 8a7a 8acf: 66 48 0f 7e c8 movq %xmm1,%rax 8ad4: c5 f8 77 vzeroupper 8ad7: 41 58 pop %r8 8ad9: 41 59 pop %r9 8adb: c3 retq 0000000000008adc : 8adc: 41 51 push %r9 8ade: 41 50 push %r8 8ae0: 49 c7 c1 14 00 00 00 mov $0x14,%r9 8ae7: f3 49 0f 2a c1 cvtsi2ss %r9,%xmm0 8aec: c4 e2 7d 18 f0 vbroadcastss %xmm0,%ymm6 8af1: c5 fd 6f c8 vmovdqa %ymm0,%ymm1 8af5: c5 fd 6f d0 vmovdqa %ymm0,%ymm2 8af9: c5 fd 6f d8 vmovdqa %ymm0,%ymm3 8afd: c5 fd 6f e0 vmovdqa %ymm0,%ymm4 8b01: c5 fd 6f e8 vmovdqa %ymm0,%ymm5 8b05: c5 fd 6f f0 vmovdqa %ymm0,%ymm6 8b09: c5 fd 6f f8 vmovdqa %ymm0,%ymm7 8b0d: c5 7d 6f c0 vmovdqa %ymm0,%ymm8 8b11: c5 7d 6f c8 vmovdqa %ymm0,%ymm9 8b15: c5 7d 6f d0 vmovdqa %ymm0,%ymm10 0000000000008b19 : 8b19: c5 f4 58 c8 vaddps %ymm0,%ymm1,%ymm1 8b1d: c5 ec 58 d0 vaddps %ymm0,%ymm2,%ymm2 8b21: c5 e4 58 d8 vaddps %ymm0,%ymm3,%ymm3 8b25: c5 dc 58 e0 vaddps %ymm0,%ymm4,%ymm4 8b29: c5 d4 58 e8 vaddps %ymm0,%ymm5,%ymm5 8b2d: c5 cc 58 f0 vaddps %ymm0,%ymm6,%ymm6 8b31: c5 c4 58 f8 vaddps %ymm0,%ymm7,%ymm7 8b35: c5 3c 58 c0 vaddps %ymm0,%ymm8,%ymm8 8b39: c5 34 58 c8 vaddps %ymm0,%ymm9,%ymm9 8b3d: c5 2c 58 d0 vaddps 
%ymm0,%ymm10,%ymm10 8b41: c5 f4 58 c8 vaddps %ymm0,%ymm1,%ymm1 8b45: c5 ec 58 d0 vaddps %ymm0,%ymm2,%ymm2 8b49: c5 e4 58 d8 vaddps %ymm0,%ymm3,%ymm3 8b4d: c5 dc 58 e0 vaddps %ymm0,%ymm4,%ymm4 8b51: c5 d4 58 e8 vaddps %ymm0,%ymm5,%ymm5 8b55: c5 cc 58 f0 vaddps %ymm0,%ymm6,%ymm6 8b59: c5 c4 58 f8 vaddps %ymm0,%ymm7,%ymm7 8b5d: c5 3c 58 c0 vaddps %ymm0,%ymm8,%ymm8 8b61: c5 34 58 c8 vaddps %ymm0,%ymm9,%ymm9 8b65: c5 2c 58 d0 vaddps %ymm0,%ymm10,%ymm10 8b69: 4c 29 cf sub %r9,%rdi 8b6c: 75 ab jne 8b19 8b6e: 66 48 0f 7e c8 movq %xmm1,%rax 8b73: c5 f8 77 vzeroupper 8b76: 41 58 pop %r8 8b78: 41 59 pop %r9 8b7a: c3 retq 0000000000008b7b : 8b7b: 41 51 push %r9 8b7d: 41 50 push %r8 8b7f: 49 c7 c1 14 00 00 00 mov $0x14,%r9 8b86: 66 49 0f 6e c9 movq %r9,%xmm1 8b8b: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 8b90: c4 e2 7d 18 f6 vbroadcastss %xmm6,%ymm6 0000000000008b95 : 8b95: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8b99: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8b9d: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8ba1: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8ba5: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8ba9: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8bad: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8bb1: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8bb5: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8bb9: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8bbd: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8bc1: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8bc5: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8bc9: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8bcd: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8bd1: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8bd5: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8bd9: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8bdd: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8be1: c5 cc 59 f6 vmulps %ymm6,%ymm6,%ymm6 8be5: 4c 29 cf sub %r9,%rdi 8be8: 75 ab jne 8b95 8bea: 66 48 0f 7e c8 movq %xmm1,%rax 8bef: c5 f8 77 vzeroupper 8bf2: 41 58 pop %r8 8bf4: 41 59 pop %r9 8bf6: c3 retq 0000000000008bf7 : 8bf7: 41 51 push %r9 8bf9: 41 50 push %r8 8bfb: 49 c7 c1 14 00 00 00 mov $0x14,%r9 8c02: 66 
49 0f 6e c9 movq %r9,%xmm1 8c07: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 8c0c: 62 f2 7d 48 18 f6 vbroadcastss %xmm6,%zmm6 8c12: 62 f1 7c 48 10 ee vmovups %zmm6,%zmm5 8c18: 62 f1 7c 48 10 fe vmovups %zmm6,%zmm7 8c1e: 62 71 7c 48 10 c6 vmovups %zmm6,%zmm8 8c24: 62 71 7c 48 10 ce vmovups %zmm6,%zmm9 8c2a: 62 71 7c 48 10 d6 vmovups %zmm6,%zmm10 8c30: 62 71 7c 48 10 de vmovups %zmm6,%zmm11 8c36: 62 71 7c 48 10 e6 vmovups %zmm6,%zmm12 8c3c: 62 71 7c 48 10 ee vmovups %zmm6,%zmm13 8c42: 62 71 7c 48 10 f6 vmovups %zmm6,%zmm14 8c48: 62 71 7c 48 10 fe vmovups %zmm6,%zmm15 0000000000008c4e : 8c4e: 62 f2 55 48 98 ee vfmadd132ps %zmm6,%zmm5,%zmm5 8c54: 62 f2 45 48 98 fe vfmadd132ps %zmm6,%zmm7,%zmm7 8c5a: 62 72 3d 48 98 c6 vfmadd132ps %zmm6,%zmm8,%zmm8 8c60: 62 72 35 48 98 ce vfmadd132ps %zmm6,%zmm9,%zmm9 8c66: 62 72 2d 48 98 d6 vfmadd132ps %zmm6,%zmm10,%zmm10 8c6c: 62 72 25 48 98 de vfmadd132ps %zmm6,%zmm11,%zmm11 8c72: 62 72 1d 48 98 e6 vfmadd132ps %zmm6,%zmm12,%zmm12 8c78: 62 72 15 48 98 ee vfmadd132ps %zmm6,%zmm13,%zmm13 8c7e: 62 72 0d 48 98 f6 vfmadd132ps %zmm6,%zmm14,%zmm14 8c84: 62 72 05 48 98 fe vfmadd132ps %zmm6,%zmm15,%zmm15 8c8a: 62 f2 55 48 98 ee vfmadd132ps %zmm6,%zmm5,%zmm5 8c90: 62 f2 45 48 98 fe vfmadd132ps %zmm6,%zmm7,%zmm7 8c96: 62 72 3d 48 98 c6 vfmadd132ps %zmm6,%zmm8,%zmm8 8c9c: 62 72 35 48 98 ce vfmadd132ps %zmm6,%zmm9,%zmm9 8ca2: 62 72 2d 48 98 d6 vfmadd132ps %zmm6,%zmm10,%zmm10 8ca8: 62 72 25 48 98 de vfmadd132ps %zmm6,%zmm11,%zmm11 8cae: 62 72 1d 48 98 e6 vfmadd132ps %zmm6,%zmm12,%zmm12 8cb4: 62 72 15 48 98 ee vfmadd132ps %zmm6,%zmm13,%zmm13 8cba: 62 72 0d 48 98 f6 vfmadd132ps %zmm6,%zmm14,%zmm14 8cc0: 62 72 05 48 98 fe vfmadd132ps %zmm6,%zmm15,%zmm15 8cc6: 4c 29 cf sub %r9,%rdi 8cc9: 75 83 jne 8c4e 8ccb: 66 48 0f 7e c8 movq %xmm1,%rax 8cd0: c5 f8 77 vzeroupper 8cd3: 41 58 pop %r8 8cd5: 41 59 pop %r9 8cd7: c3 retq 0000000000008cd8 : 8cd8: 41 51 push %r9 8cda: 41 50 push %r8 8cdc: 49 c7 c1 14 00 00 00 mov $0x14,%r9 8ce3: 66 49 0f 6e c9 movq %r9,%xmm1 8ce8: f3 
49 0f 2a f1 cvtsi2ss %r9,%xmm6 8ced: 62 f2 7d 48 18 f6 vbroadcastss %xmm6,%zmm6 8cf3: 62 f1 7c 48 10 ee vmovups %zmm6,%zmm5 8cf9: 62 f1 7c 48 10 fe vmovups %zmm6,%zmm7 8cff: 62 71 7c 48 10 c6 vmovups %zmm6,%zmm8 8d05: 62 71 7c 48 10 ce vmovups %zmm6,%zmm9 8d0b: 62 71 7c 48 10 d6 vmovups %zmm6,%zmm10 8d11: 62 71 7c 48 10 de vmovups %zmm6,%zmm11 8d17: 62 71 7c 48 10 e6 vmovups %zmm6,%zmm12 8d1d: 62 71 7c 48 10 ee vmovups %zmm6,%zmm13 8d23: 62 71 7c 48 10 f6 vmovups %zmm6,%zmm14 8d29: 62 71 7c 48 10 fe vmovups %zmm6,%zmm15 0000000000008d2f : 8d2f: c4 e2 55 98 ee vfmadd132ps %ymm6,%ymm5,%ymm5 8d34: 62 f2 45 48 98 fe vfmadd132ps %zmm6,%zmm7,%zmm7 8d3a: c4 62 3d 98 c6 vfmadd132ps %ymm6,%ymm8,%ymm8 8d3f: 62 72 35 48 98 ce vfmadd132ps %zmm6,%zmm9,%zmm9 8d45: c4 62 2d 98 d6 vfmadd132ps %ymm6,%ymm10,%ymm10 8d4a: 62 72 25 48 98 de vfmadd132ps %zmm6,%zmm11,%zmm11 8d50: c4 62 1d 98 e6 vfmadd132ps %ymm6,%ymm12,%ymm12 8d55: 62 72 15 48 98 ee vfmadd132ps %zmm6,%zmm13,%zmm13 8d5b: c4 62 0d 98 f6 vfmadd132ps %ymm6,%ymm14,%ymm14 8d60: 62 72 05 48 98 fe vfmadd132ps %zmm6,%zmm15,%zmm15 8d66: c4 e2 55 98 ee vfmadd132ps %ymm6,%ymm5,%ymm5 8d6b: 62 f2 45 48 98 fe vfmadd132ps %zmm6,%zmm7,%zmm7 8d71: c4 62 3d 98 c6 vfmadd132ps %ymm6,%ymm8,%ymm8 8d76: 62 72 35 48 98 ce vfmadd132ps %zmm6,%zmm9,%zmm9 8d7c: c4 62 2d 98 d6 vfmadd132ps %ymm6,%ymm10,%ymm10 8d81: 62 72 25 48 98 de vfmadd132ps %zmm6,%zmm11,%zmm11 8d87: c4 62 1d 98 e6 vfmadd132ps %ymm6,%ymm12,%ymm12 8d8c: 62 72 15 48 98 ee vfmadd132ps %zmm6,%zmm13,%zmm13 8d92: c4 62 0d 98 f6 vfmadd132ps %ymm6,%ymm14,%ymm14 8d97: 62 72 05 48 98 fe vfmadd132ps %zmm6,%zmm15,%zmm15 8d9d: 4c 29 cf sub %r9,%rdi 8da0: 75 8d jne 8d2f 8da2: 66 48 0f 7e c8 movq %xmm1,%rax 8da7: c5 f8 77 vzeroupper 8daa: 41 58 pop %r8 8dac: 41 59 pop %r9 8dae: c3 retq 0000000000008daf : 8daf: 41 51 push %r9 8db1: 41 50 push %r8 8db3: 49 c7 c1 14 00 00 00 mov $0x14,%r9 8dba: 66 49 0f 6e c9 movq %r9,%xmm1 8dbf: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 8dc4: c4 e2 7d 18 f6 vbroadcastss 
%xmm6,%ymm6 8dc9: c5 fc 10 ee vmovups %ymm6,%ymm5 8dcd: c5 fc 10 fe vmovups %ymm6,%ymm7 8dd1: c5 7c 10 c6 vmovups %ymm6,%ymm8 8dd5: c5 7c 10 ce vmovups %ymm6,%ymm9 8dd9: c5 7c 10 d6 vmovups %ymm6,%ymm10 8ddd: c5 7c 10 de vmovups %ymm6,%ymm11 8de1: c5 7c 10 e6 vmovups %ymm6,%ymm12 8de5: c5 7c 10 ee vmovups %ymm6,%ymm13 8de9: c5 7c 10 f6 vmovups %ymm6,%ymm14 8ded: c5 7c 10 fe vmovups %ymm6,%ymm15 0000000000008df1 : 8df1: c4 e2 55 98 ee vfmadd132ps %ymm6,%ymm5,%ymm5 8df6: c4 e2 45 98 fe vfmadd132ps %ymm6,%ymm7,%ymm7 8dfb: c4 62 3d 98 c6 vfmadd132ps %ymm6,%ymm8,%ymm8 8e00: c4 62 35 98 ce vfmadd132ps %ymm6,%ymm9,%ymm9 8e05: c4 62 2d 98 d6 vfmadd132ps %ymm6,%ymm10,%ymm10 8e0a: c4 62 25 98 de vfmadd132ps %ymm6,%ymm11,%ymm11 8e0f: c4 62 1d 98 e6 vfmadd132ps %ymm6,%ymm12,%ymm12 8e14: c4 62 15 98 ee vfmadd132ps %ymm6,%ymm13,%ymm13 8e19: c4 62 0d 98 f6 vfmadd132ps %ymm6,%ymm14,%ymm14 8e1e: c4 62 05 98 fe vfmadd132ps %ymm6,%ymm15,%ymm15 8e23: c4 e2 55 98 ee vfmadd132ps %ymm6,%ymm5,%ymm5 8e28: c4 e2 45 98 fe vfmadd132ps %ymm6,%ymm7,%ymm7 8e2d: c4 62 3d 98 c6 vfmadd132ps %ymm6,%ymm8,%ymm8 8e32: c4 62 35 98 ce vfmadd132ps %ymm6,%ymm9,%ymm9 8e37: c4 62 2d 98 d6 vfmadd132ps %ymm6,%ymm10,%ymm10 8e3c: c4 62 25 98 de vfmadd132ps %ymm6,%ymm11,%ymm11 8e41: c4 62 1d 98 e6 vfmadd132ps %ymm6,%ymm12,%ymm12 8e46: c4 62 15 98 ee vfmadd132ps %ymm6,%ymm13,%ymm13 8e4b: c4 62 0d 98 f6 vfmadd132ps %ymm6,%ymm14,%ymm14 8e50: c4 62 05 98 fe vfmadd132ps %ymm6,%ymm15,%ymm15 8e55: 4c 29 cf sub %r9,%rdi 8e58: 75 97 jne 8df1 8e5a: 66 48 0f 7e c8 movq %xmm1,%rax 8e5f: c5 f8 77 vzeroupper 8e62: 41 58 pop %r8 8e64: 41 59 pop %r9 8e66: c3 retq 0000000000008e67 : 8e67: 41 51 push %r9 8e69: 41 50 push %r8 8e6b: c5 f8 77 vzeroupper 8e6e: 49 c7 c1 14 00 00 00 mov $0x14,%r9 8e75: 66 49 0f 6e c9 movq %r9,%xmm1 8e7a: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 8e7f: c4 e2 79 18 f6 vbroadcastss %xmm6,%xmm6 8e84: c5 f8 10 ee vmovups %xmm6,%xmm5 8e88: c5 f8 10 fe vmovups %xmm6,%xmm7 8e8c: c5 78 10 c6 vmovups %xmm6,%xmm8 8e90: c5 
78 10 ce vmovups %xmm6,%xmm9 8e94: c5 78 10 d6 vmovups %xmm6,%xmm10 8e98: c5 78 10 de vmovups %xmm6,%xmm11 8e9c: c5 78 10 e6 vmovups %xmm6,%xmm12 8ea0: c5 78 10 ee vmovups %xmm6,%xmm13 8ea4: c5 78 10 f6 vmovups %xmm6,%xmm14 8ea8: c5 78 10 fe vmovups %xmm6,%xmm15 0000000000008eac : 8eac: c4 e2 51 98 ee vfmadd132ps %xmm6,%xmm5,%xmm5 8eb1: c4 e2 41 98 fe vfmadd132ps %xmm6,%xmm7,%xmm7 8eb6: c4 62 39 98 c6 vfmadd132ps %xmm6,%xmm8,%xmm8 8ebb: c4 62 31 98 ce vfmadd132ps %xmm6,%xmm9,%xmm9 8ec0: c4 62 29 98 d6 vfmadd132ps %xmm6,%xmm10,%xmm10 8ec5: c4 62 21 98 de vfmadd132ps %xmm6,%xmm11,%xmm11 8eca: c4 62 19 98 e6 vfmadd132ps %xmm6,%xmm12,%xmm12 8ecf: c4 62 11 98 ee vfmadd132ps %xmm6,%xmm13,%xmm13 8ed4: c4 62 09 98 f6 vfmadd132ps %xmm6,%xmm14,%xmm14 8ed9: c4 62 01 98 fe vfmadd132ps %xmm6,%xmm15,%xmm15 8ede: c4 e2 51 98 ee vfmadd132ps %xmm6,%xmm5,%xmm5 8ee3: c4 e2 41 98 fe vfmadd132ps %xmm6,%xmm7,%xmm7 8ee8: c4 62 39 98 c6 vfmadd132ps %xmm6,%xmm8,%xmm8 8eed: c4 62 31 98 ce vfmadd132ps %xmm6,%xmm9,%xmm9 8ef2: c4 62 29 98 d6 vfmadd132ps %xmm6,%xmm10,%xmm10 8ef7: c4 62 21 98 de vfmadd132ps %xmm6,%xmm11,%xmm11 8efc: c4 62 19 98 e6 vfmadd132ps %xmm6,%xmm12,%xmm12 8f01: c4 62 11 98 ee vfmadd132ps %xmm6,%xmm13,%xmm13 8f06: c4 62 09 98 f6 vfmadd132ps %xmm6,%xmm14,%xmm14 8f0b: c4 62 01 98 fe vfmadd132ps %xmm6,%xmm15,%xmm15 8f10: 4c 29 cf sub %r9,%rdi 8f13: 75 97 jne 8eac 8f15: 66 48 0f 7e c8 movq %xmm1,%rax 8f1a: c5 f8 77 vzeroupper 8f1d: 41 58 pop %r8 8f1f: 41 59 pop %r9 8f21: c3 retq 0000000000008f22 : 8f22: 41 51 push %r9 8f24: 41 50 push %r8 8f26: 49 c7 c1 1e 00 00 00 mov $0x1e,%r9 8f2d: 66 49 0f 6e c9 movq %r9,%xmm1 8f32: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 8f37: c4 e2 7d 18 f6 vbroadcastss %xmm6,%ymm6 8f3c: c5 fc 10 c6 vmovups %ymm6,%ymm0 8f40: c5 fc 10 ce vmovups %ymm6,%ymm1 8f44: c5 fc 10 d6 vmovups %ymm6,%ymm2 8f48: c5 fc 10 de vmovups %ymm6,%ymm3 8f4c: c5 fc 10 e6 vmovups %ymm6,%ymm4 8f50: c5 fc 10 ee vmovups %ymm6,%ymm5 8f54: c5 fc 10 fe vmovups %ymm6,%ymm7 8f58: c5 7c 10 c6 
vmovups %ymm6,%ymm8 8f5c: c5 7c 10 ce vmovups %ymm6,%ymm9 8f60: c5 7c 10 d6 vmovups %ymm6,%ymm10 8f64: c5 7c 10 de vmovups %ymm6,%ymm11 8f68: c5 7c 10 e6 vmovups %ymm6,%ymm12 8f6c: c5 7c 10 ee vmovups %ymm6,%ymm13 8f70: c5 7c 10 f6 vmovups %ymm6,%ymm14 8f74: c5 7c 10 fe vmovups %ymm6,%ymm15 0000000000008f78 : 8f78: c4 e2 55 98 ee vfmadd132ps %ymm6,%ymm5,%ymm5 8f7d: c4 e2 45 98 fe vfmadd132ps %ymm6,%ymm7,%ymm7 8f82: c4 41 54 58 da vaddps %ymm10,%ymm5,%ymm11 8f87: c4 62 3d 98 c6 vfmadd132ps %ymm6,%ymm8,%ymm8 8f8c: c4 62 35 98 ce vfmadd132ps %ymm6,%ymm9,%ymm9 8f91: c4 41 54 58 ec vaddps %ymm12,%ymm5,%ymm13 8f96: c4 62 0d 98 f6 vfmadd132ps %ymm6,%ymm14,%ymm14 8f9b: c4 62 05 98 fe vfmadd132ps %ymm6,%ymm15,%ymm15 8fa0: c4 41 4c 58 ec vaddps %ymm12,%ymm6,%ymm13 8fa5: c4 e2 7d 98 ce vfmadd132ps %ymm6,%ymm0,%ymm1 8faa: c4 e2 6d 98 de vfmadd132ps %ymm6,%ymm2,%ymm3 8faf: c5 d4 58 e6 vaddps %ymm6,%ymm5,%ymm4 8fb3: c4 e2 55 98 ee vfmadd132ps %ymm6,%ymm5,%ymm5 8fb8: c4 e2 45 98 fe vfmadd132ps %ymm6,%ymm7,%ymm7 8fbd: c4 41 4c 58 da vaddps %ymm10,%ymm6,%ymm11 8fc2: c4 62 3d 98 c6 vfmadd132ps %ymm6,%ymm8,%ymm8 8fc7: c4 62 35 98 ce vfmadd132ps %ymm6,%ymm9,%ymm9 8fcc: c4 41 44 58 ec vaddps %ymm12,%ymm7,%ymm13 8fd1: c4 62 0d 98 f6 vfmadd132ps %ymm6,%ymm14,%ymm14 8fd6: c4 62 05 98 fe vfmadd132ps %ymm6,%ymm15,%ymm15 8fdb: c4 41 54 58 ec vaddps %ymm12,%ymm5,%ymm13 8fe0: c4 e2 7d 98 ce vfmadd132ps %ymm6,%ymm0,%ymm1 8fe5: c4 e2 6d 98 de vfmadd132ps %ymm6,%ymm2,%ymm3 8fea: c5 d4 58 e6 vaddps %ymm6,%ymm5,%ymm4 8fee: c4 e2 55 98 ee vfmadd132ps %ymm6,%ymm5,%ymm5 8ff3: c4 e2 45 98 fe vfmadd132ps %ymm6,%ymm7,%ymm7 8ff8: c4 41 4c 58 da vaddps %ymm10,%ymm6,%ymm11 8ffd: c4 62 3d 98 c6 vfmadd132ps %ymm6,%ymm8,%ymm8 9002: c4 62 35 98 ce vfmadd132ps %ymm6,%ymm9,%ymm9 9007: c4 41 54 58 ec vaddps %ymm12,%ymm5,%ymm13 900c: 4c 29 cf sub %r9,%rdi 900f: 0f 85 63 ff ff ff jne 8f78 9015: 66 48 0f 7e c8 movq %xmm1,%rax 901a: c5 f8 77 vzeroupper 901d: 41 58 pop %r8 901f: 41 59 pop %r9 9021: c3 retq 
0000000000009022 : 9022: 41 51 push %r9 9024: 41 50 push %r8 9026: 49 c7 c1 10 00 00 00 mov $0x10,%r9 902d: 66 49 0f 6e c1 movq %r9,%xmm0 9032: 62 f2 fd 48 59 c0 vpbroadcastq %xmm0,%zmm0 9038: f3 49 0f 2a c9 cvtsi2ss %r9,%xmm1 903d: 62 f2 7d 48 18 c9 vbroadcastss %xmm1,%zmm1 9043: 62 f1 fd 48 6f d8 vmovdqa64 %zmm0,%zmm3 9049: 62 f1 fd 48 6f f0 vmovdqa64 %zmm0,%zmm6 904f: 62 71 fd 48 6f c8 vmovdqa64 %zmm0,%zmm9 9055: 62 71 fd 48 6f e0 vmovdqa64 %zmm0,%zmm12 905b: 62 71 fd 48 6f f8 vmovdqa64 %zmm0,%zmm15 9061: 62 f1 7c 48 28 d1 vmovaps %zmm1,%zmm2 9067: 62 f1 7c 48 28 e1 vmovaps %zmm1,%zmm4 906d: 62 f1 7c 48 28 e9 vmovaps %zmm1,%zmm5 9073: 62 f1 7c 48 28 f9 vmovaps %zmm1,%zmm7 9079: 62 71 7c 48 28 c1 vmovaps %zmm1,%zmm8 907f: 62 71 7c 48 28 d1 vmovaps %zmm1,%zmm10 9085: 62 71 7c 48 28 d9 vmovaps %zmm1,%zmm11 908b: 62 71 7c 48 28 e9 vmovaps %zmm1,%zmm13 9091: 62 71 7c 48 28 f1 vmovaps %zmm1,%zmm14 0000000000009097 : 9097: 62 f1 85 48 d4 c0 vpaddq %zmm0,%zmm15,%zmm0 909d: 62 f2 75 48 98 c9 vfmadd132ps %zmm1,%zmm1,%zmm1 90a3: 62 f2 6d 48 98 d2 vfmadd132ps %zmm2,%zmm2,%zmm2 90a9: 62 f1 85 48 d4 db vpaddq %zmm3,%zmm15,%zmm3 90af: 62 f2 5d 48 98 e4 vfmadd132ps %zmm4,%zmm4,%zmm4 90b5: 62 f2 55 48 98 ed vfmadd132ps %zmm5,%zmm5,%zmm5 90bb: 62 f1 85 48 d4 f6 vpaddq %zmm6,%zmm15,%zmm6 90c1: 62 f2 45 48 98 ff vfmadd132ps %zmm7,%zmm7,%zmm7 90c7: 62 52 3d 48 98 c0 vfmadd132ps %zmm8,%zmm8,%zmm8 90cd: 62 51 85 48 d4 c9 vpaddq %zmm9,%zmm15,%zmm9 90d3: 62 52 2d 48 98 d2 vfmadd132ps %zmm10,%zmm10,%zmm10 90d9: 62 52 25 48 98 db vfmadd132ps %zmm11,%zmm11,%zmm11 90df: 62 51 85 48 d4 e4 vpaddq %zmm12,%zmm15,%zmm12 90e5: 62 52 15 48 98 ed vfmadd132ps %zmm13,%zmm13,%zmm13 90eb: 62 52 0d 48 98 f6 vfmadd132ps %zmm14,%zmm14,%zmm14 90f1: 4c 29 cf sub %r9,%rdi 90f4: 7f a1 jg 9097 90f6: 66 48 0f 7e c8 movq %xmm1,%rax 90fb: c5 f8 77 vzeroupper 90fe: 41 58 pop %r8 9100: 41 59 pop %r9 9102: c3 retq 0000000000009103 : 9103: 41 51 push %r9 9105: 41 50 push %r8 9107: 49 c7 c1 10 00 00 00 mov $0x10,%r9 
910e: 66 49 0f 6e c1 movq %r9,%xmm0 9113: c4 e2 7d 59 c0 vpbroadcastq %xmm0,%ymm0 9118: f3 49 0f 2a c9 cvtsi2ss %r9,%xmm1 911d: 62 f2 7d 48 18 c9 vbroadcastss %xmm1,%zmm1 9123: c5 fd 6f d8 vmovdqa %ymm0,%ymm3 9127: c5 fd 6f f0 vmovdqa %ymm0,%ymm6 912b: c5 7d 6f c8 vmovdqa %ymm0,%ymm9 912f: c5 7d 6f e0 vmovdqa %ymm0,%ymm12 9133: c5 7d 6f f8 vmovdqa %ymm0,%ymm15 9137: 62 f1 7c 48 28 d1 vmovaps %zmm1,%zmm2 913d: 62 f1 7c 48 28 e1 vmovaps %zmm1,%zmm4 9143: 62 f1 7c 48 28 e9 vmovaps %zmm1,%zmm5 9149: 62 f1 7c 48 28 f9 vmovaps %zmm1,%zmm7 914f: 62 71 7c 48 28 c1 vmovaps %zmm1,%zmm8 9155: 62 71 7c 48 28 d1 vmovaps %zmm1,%zmm10 915b: 62 71 7c 48 28 d9 vmovaps %zmm1,%zmm11 9161: 62 71 7c 48 28 e9 vmovaps %zmm1,%zmm13 9167: 62 71 7c 48 28 f1 vmovaps %zmm1,%zmm14 000000000000916d : 916d: c5 85 d4 c0 vpaddq %ymm0,%ymm15,%ymm0 9171: 62 f2 75 48 98 c9 vfmadd132ps %zmm1,%zmm1,%zmm1 9177: 62 f2 6d 48 98 d2 vfmadd132ps %zmm2,%zmm2,%zmm2 917d: c5 85 d4 db vpaddq %ymm3,%ymm15,%ymm3 9181: 62 f2 5d 48 98 e4 vfmadd132ps %zmm4,%zmm4,%zmm4 9187: 62 f2 55 48 98 ed vfmadd132ps %zmm5,%zmm5,%zmm5 918d: c5 85 d4 f6 vpaddq %ymm6,%ymm15,%ymm6 9191: 62 f2 45 48 98 ff vfmadd132ps %zmm7,%zmm7,%zmm7 9197: 62 52 3d 48 98 c0 vfmadd132ps %zmm8,%zmm8,%zmm8 919d: c4 41 05 d4 c9 vpaddq %ymm9,%ymm15,%ymm9 91a2: 62 52 2d 48 98 d2 vfmadd132ps %zmm10,%zmm10,%zmm10 91a8: 62 52 25 48 98 db vfmadd132ps %zmm11,%zmm11,%zmm11 91ae: c4 41 05 d4 e4 vpaddq %ymm12,%ymm15,%ymm12 91b3: 62 52 15 48 98 ed vfmadd132ps %zmm13,%zmm13,%zmm13 91b9: 62 52 0d 48 98 f6 vfmadd132ps %zmm14,%zmm14,%zmm14 91bf: 4c 29 cf sub %r9,%rdi 91c2: 7f a9 jg 916d 91c4: 66 48 0f 7e c8 movq %xmm1,%rax 91c9: c5 f8 77 vzeroupper 91cc: 41 58 pop %r8 91ce: 41 59 pop %r9 91d0: c3 retq 00000000000091d1 : 91d1: 41 51 push %r9 91d3: 41 50 push %r8 91d5: 49 c7 c1 10 00 00 00 mov $0x10,%r9 91dc: 66 49 0f 6e c1 movq %r9,%xmm0 91e1: c4 e2 7d 59 c0 vpbroadcastq %xmm0,%ymm0 91e6: f3 49 0f 2a c9 cvtsi2ss %r9,%xmm1 91eb: c4 e2 7d 18 c9 vbroadcastss %xmm1,%ymm1 
91f0: c5 fd 6f d8 vmovdqa %ymm0,%ymm3 91f4: c5 fd 6f f0 vmovdqa %ymm0,%ymm6 91f8: c5 7d 6f c8 vmovdqa %ymm0,%ymm9 91fc: c5 7d 6f e0 vmovdqa %ymm0,%ymm12 9200: c5 7d 6f f8 vmovdqa %ymm0,%ymm15 9204: c5 fc 28 d1 vmovaps %ymm1,%ymm2 9208: c5 fc 28 e1 vmovaps %ymm1,%ymm4 920c: c5 fc 28 e9 vmovaps %ymm1,%ymm5 9210: c5 fc 28 f9 vmovaps %ymm1,%ymm7 9214: c5 7c 28 c1 vmovaps %ymm1,%ymm8 9218: c5 7c 28 d1 vmovaps %ymm1,%ymm10 921c: c5 7c 28 d9 vmovaps %ymm1,%ymm11 9220: c5 7c 28 e9 vmovaps %ymm1,%ymm13 9224: c5 7c 28 f1 vmovaps %ymm1,%ymm14 0000000000009228 : 9228: c5 85 d4 c0 vpaddq %ymm0,%ymm15,%ymm0 922c: c4 e2 75 98 c9 vfmadd132ps %ymm1,%ymm1,%ymm1 9231: c4 e2 6d 98 d2 vfmadd132ps %ymm2,%ymm2,%ymm2 9236: c5 85 d4 db vpaddq %ymm3,%ymm15,%ymm3 923a: c4 e2 5d 98 e4 vfmadd132ps %ymm4,%ymm4,%ymm4 923f: c4 e2 55 98 ed vfmadd132ps %ymm5,%ymm5,%ymm5 9244: c5 85 d4 f6 vpaddq %ymm6,%ymm15,%ymm6 9248: c4 e2 45 98 ff vfmadd132ps %ymm7,%ymm7,%ymm7 924d: c4 42 3d 98 c0 vfmadd132ps %ymm8,%ymm8,%ymm8 9252: c4 41 05 d4 c9 vpaddq %ymm9,%ymm15,%ymm9 9257: c4 42 2d 98 d2 vfmadd132ps %ymm10,%ymm10,%ymm10 925c: c4 42 25 98 db vfmadd132ps %ymm11,%ymm11,%ymm11 9261: c4 41 05 d4 e4 vpaddq %ymm12,%ymm15,%ymm12 9266: c4 42 15 98 ed vfmadd132ps %ymm13,%ymm13,%ymm13 926b: c4 42 0d 98 f6 vfmadd132ps %ymm14,%ymm14,%ymm14 9270: 4c 29 cf sub %r9,%rdi 9273: 7f b3 jg 9228 9275: 66 48 0f 7e c8 movq %xmm1,%rax 927a: c5 f8 77 vzeroupper 927d: 41 58 pop %r8 927f: 41 59 pop %r9 9281: c3 retq 0000000000009282 : 9282: 41 51 push %r9 9284: 41 50 push %r8 9286: 49 c7 c1 0f 00 00 00 mov $0xf,%r9 928d: 66 49 0f 6e c1 movq %r9,%xmm0 9292: c4 e2 7d 59 c0 vpbroadcastq %xmm0,%ymm0 9297: f3 49 0f 2a c9 cvtsi2ss %r9,%xmm1 929c: c4 e2 7d 18 c9 vbroadcastss %xmm1,%ymm1 92a1: c5 fd 6f d8 vmovdqa %ymm0,%ymm3 92a5: c5 fd 6f f0 vmovdqa %ymm0,%ymm6 92a9: c5 7d 6f c8 vmovdqa %ymm0,%ymm9 92ad: c5 7d 6f e0 vmovdqa %ymm0,%ymm12 92b1: c5 7d 6f f8 vmovdqa %ymm0,%ymm15 92b5: c5 fc 28 d1 vmovaps %ymm1,%ymm2 92b9: c5 fc 28 e1 vmovaps 
%ymm1,%ymm4 92bd: c5 fc 28 e9 vmovaps %ymm1,%ymm5 92c1: c5 fc 28 f9 vmovaps %ymm1,%ymm7 92c5: c5 7c 28 c1 vmovaps %ymm1,%ymm8 92c9: c5 7c 28 d1 vmovaps %ymm1,%ymm10 92cd: c5 7c 28 d9 vmovaps %ymm1,%ymm11 92d1: c5 7c 28 e9 vmovaps %ymm1,%ymm13 92d5: c5 7c 28 f1 vmovaps %ymm1,%ymm14 00000000000092d9 : 92d9: c5 85 db c0 vpand %ymm0,%ymm15,%ymm0 92dd: c4 e2 75 98 c9 vfmadd132ps %ymm1,%ymm1,%ymm1 92e2: c4 e2 6d 98 d2 vfmadd132ps %ymm2,%ymm2,%ymm2 92e7: c5 85 db db vpand %ymm3,%ymm15,%ymm3 92eb: c4 e2 5d 98 e4 vfmadd132ps %ymm4,%ymm4,%ymm4 92f0: c4 e2 55 98 ed vfmadd132ps %ymm5,%ymm5,%ymm5 92f5: c5 85 db f6 vpand %ymm6,%ymm15,%ymm6 92f9: c4 e2 45 98 ff vfmadd132ps %ymm7,%ymm7,%ymm7 92fe: c4 42 3d 98 c0 vfmadd132ps %ymm8,%ymm8,%ymm8 9303: c4 41 05 db c9 vpand %ymm9,%ymm15,%ymm9 9308: c4 42 2d 98 d2 vfmadd132ps %ymm10,%ymm10,%ymm10 930d: c4 42 25 98 db vfmadd132ps %ymm11,%ymm11,%ymm11 9312: c4 41 05 db e4 vpand %ymm12,%ymm15,%ymm12 9317: c4 42 15 98 ed vfmadd132ps %ymm13,%ymm13,%ymm13 931c: c4 42 0d 98 f6 vfmadd132ps %ymm14,%ymm14,%ymm14 9321: 4c 29 cf sub %r9,%rdi 9324: 7f b3 jg 92d9 9326: 66 48 0f 7e c8 movq %xmm1,%rax 932b: c5 f8 77 vzeroupper 932e: 41 58 pop %r8 9330: 41 59 pop %r9 9332: c3 retq 0000000000009333 : 9333: 41 51 push %r9 9335: 41 50 push %r8 9337: 49 c7 c1 16 00 00 00 mov $0x16,%r9 933e: 66 49 0f 6e c1 movq %r9,%xmm0 9343: c4 e2 7d 59 c0 vpbroadcastq %xmm0,%ymm0 9348: f3 49 0f 2a c9 cvtsi2ss %r9,%xmm1 934d: c4 e2 7d 18 c9 vbroadcastss %xmm1,%ymm1 9352: c5 fd 6f d8 vmovdqa %ymm0,%ymm3 9356: c5 fc 28 f1 vmovaps %ymm1,%ymm6 935a: c5 7c 28 c9 vmovaps %ymm1,%ymm9 935e: c5 7c 28 e1 vmovaps %ymm1,%ymm12 9362: c5 7c 28 f9 vmovaps %ymm1,%ymm15 9366: c5 fc 28 d1 vmovaps %ymm1,%ymm2 936a: c5 fc 28 e1 vmovaps %ymm1,%ymm4 936e: c5 fc 28 e9 vmovaps %ymm1,%ymm5 9372: c5 fc 28 f9 vmovaps %ymm1,%ymm7 9376: c5 7c 28 c1 vmovaps %ymm1,%ymm8 937a: c5 7c 28 d1 vmovaps %ymm1,%ymm10 937e: c5 7c 28 d9 vmovaps %ymm1,%ymm11 9382: c5 7c 28 e9 vmovaps %ymm1,%ymm13 9386: c5 7c 28 f1 
vmovaps %ymm1,%ymm14 000000000000938a : 938a: c5 fd db c0 vpand %ymm0,%ymm0,%ymm0 938e: c4 e2 75 98 c9 vfmadd132ps %ymm1,%ymm1,%ymm1 9393: c4 e2 6d 98 16 vfmadd132ps (%rsi),%ymm2,%ymm2 9398: c5 e5 db db vpand %ymm3,%ymm3,%ymm3 939c: c4 e2 5d 98 e4 vfmadd132ps %ymm4,%ymm4,%ymm4 93a1: c4 e2 55 98 2e vfmadd132ps (%rsi),%ymm5,%ymm5 93a6: c5 fd db c0 vpand %ymm0,%ymm0,%ymm0 93aa: c4 e2 45 98 ff vfmadd132ps %ymm7,%ymm7,%ymm7 93af: c4 62 3d 98 06 vfmadd132ps (%rsi),%ymm8,%ymm8 93b4: c5 e5 db db vpand %ymm3,%ymm3,%ymm3 93b8: c4 42 2d 98 d2 vfmadd132ps %ymm10,%ymm10,%ymm10 93bd: c4 62 25 98 1e vfmadd132ps (%rsi),%ymm11,%ymm11 93c2: c5 fd db c0 vpand %ymm0,%ymm0,%ymm0 93c6: c4 42 15 98 ed vfmadd132ps %ymm13,%ymm13,%ymm13 93cb: c4 62 0d 98 36 vfmadd132ps (%rsi),%ymm14,%ymm14 93d0: c5 e5 db db vpand %ymm3,%ymm3,%ymm3 93d4: c4 e2 4d 98 f6 vfmadd132ps %ymm6,%ymm6,%ymm6 93d9: c4 62 35 98 0e vfmadd132ps (%rsi),%ymm9,%ymm9 93de: c5 fd db c0 vpand %ymm0,%ymm0,%ymm0 93e2: c4 42 1d 98 e4 vfmadd132ps %ymm12,%ymm12,%ymm12 93e7: c4 62 05 98 3e vfmadd132ps (%rsi),%ymm15,%ymm15 93ec: 4c 29 cf sub %r9,%rdi 93ef: 7f 99 jg 938a 93f1: 66 48 0f 7e c8 movq %xmm1,%rax 93f6: c5 f8 77 vzeroupper 93f9: 41 58 pop %r8 93fb: 41 59 pop %r9 93fd: c3 retq 00000000000093fe : 93fe: 41 51 push %r9 9400: 41 50 push %r8 9402: 49 c7 c1 16 00 00 00 mov $0x16,%r9 9409: 66 49 0f 6e c1 movq %r9,%xmm0 940e: c4 e2 7d 59 c0 vpbroadcastq %xmm0,%ymm0 9413: f3 49 0f 2a c9 cvtsi2ss %r9,%xmm1 9418: c4 e2 7d 18 c9 vbroadcastss %xmm1,%ymm1 941d: c5 fd 6f d8 vmovdqa %ymm0,%ymm3 9421: c5 fc 28 f1 vmovaps %ymm1,%ymm6 9425: c5 7c 28 c9 vmovaps %ymm1,%ymm9 9429: c5 7c 28 e1 vmovaps %ymm1,%ymm12 942d: c5 7c 28 f9 vmovaps %ymm1,%ymm15 9431: c5 fc 28 d1 vmovaps %ymm1,%ymm2 9435: c5 fc 28 e1 vmovaps %ymm1,%ymm4 9439: c5 fc 28 e9 vmovaps %ymm1,%ymm5 943d: c5 fc 28 f9 vmovaps %ymm1,%ymm7 9441: c5 7c 28 c1 vmovaps %ymm1,%ymm8 9445: c5 7c 28 d1 vmovaps %ymm1,%ymm10 9449: c5 7c 28 d9 vmovaps %ymm1,%ymm11 944d: c5 7c 28 e9 vmovaps 
%ymm1,%ymm13 9451: c5 7c 28 f1 vmovaps %ymm1,%ymm14 0000000000009455 : 9455: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 9459: c4 e2 75 98 c9 vfmadd132ps %ymm1,%ymm1,%ymm1 945e: c4 e2 6d 98 16 vfmadd132ps (%rsi),%ymm2,%ymm2 9463: c5 e5 d4 db vpaddq %ymm3,%ymm3,%ymm3 9467: c4 e2 5d 98 e4 vfmadd132ps %ymm4,%ymm4,%ymm4 946c: c4 e2 55 98 2e vfmadd132ps (%rsi),%ymm5,%ymm5 9471: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 9475: c4 e2 45 98 ff vfmadd132ps %ymm7,%ymm7,%ymm7 947a: c4 62 3d 98 06 vfmadd132ps (%rsi),%ymm8,%ymm8 947f: c5 e5 d4 db vpaddq %ymm3,%ymm3,%ymm3 9483: c4 42 2d 98 d2 vfmadd132ps %ymm10,%ymm10,%ymm10 9488: c4 62 25 98 1e vfmadd132ps (%rsi),%ymm11,%ymm11 948d: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 9491: c4 42 15 98 ed vfmadd132ps %ymm13,%ymm13,%ymm13 9496: c4 62 0d 98 36 vfmadd132ps (%rsi),%ymm14,%ymm14 949b: c5 e5 d4 db vpaddq %ymm3,%ymm3,%ymm3 949f: c4 e2 4d 98 f6 vfmadd132ps %ymm6,%ymm6,%ymm6 94a4: c4 62 35 98 0e vfmadd132ps (%rsi),%ymm9,%ymm9 94a9: c5 fd d4 c0 vpaddq %ymm0,%ymm0,%ymm0 94ad: c4 42 1d 98 e4 vfmadd132ps %ymm12,%ymm12,%ymm12 94b2: c4 62 05 98 3e vfmadd132ps (%rsi),%ymm15,%ymm15 94b7: 4c 29 cf sub %r9,%rdi 94ba: 7f 99 jg 9455 94bc: 66 48 0f 7e c8 movq %xmm1,%rax 94c1: c5 f8 77 vzeroupper 94c4: 41 58 pop %r8 94c6: 41 59 pop %r9 94c8: c3 retq 00000000000094c9 : 94c9: 41 51 push %r9 94cb: 49 c7 c1 10 00 00 00 mov $0x10,%r9 94d2: f3 49 0f 2a c1 cvtsi2ss %r9,%xmm0 94d7: 62 f2 7d 48 18 c8 vbroadcastss %xmm0,%zmm1 94dd: 62 f1 fd 48 6f d1 vmovdqa64 %zmm1,%zmm2 94e3: 62 f1 fd 48 6f d9 vmovdqa64 %zmm1,%zmm3 94e9: 62 f1 fd 48 6f e1 vmovdqa64 %zmm1,%zmm4 94ef: 62 f1 fd 48 6f e9 vmovdqa64 %zmm1,%zmm5 94f5: 62 f1 fd 48 6f f1 vmovdqa64 %zmm1,%zmm6 94fb: 62 f1 fd 48 6f f9 vmovdqa64 %zmm1,%zmm7 9501: 62 71 fd 48 6f c1 vmovdqa64 %zmm1,%zmm8 9507: 62 71 fd 48 6f c9 vmovdqa64 %zmm1,%zmm9 950d: 62 71 fd 48 6f d1 vmovdqa64 %zmm1,%zmm10 9513: 62 71 fd 48 6f d9 vmovdqa64 %zmm1,%zmm11 9519: 62 71 fd 48 6f e1 vmovdqa64 %zmm1,%zmm12 951f: 62 71 fd 48 6f e9 vmovdqa64 %zmm1,%zmm13 
9525: 62 71 fd 48 6f f1 vmovdqa64 %zmm1,%zmm14 952b: 62 71 fd 48 6f f9 vmovdqa64 %zmm1,%zmm15 0000000000009531 : 9531: 62 f1 7c 48 58 c0 vaddps %zmm0,%zmm0,%zmm0 9537: 62 f2 75 48 98 c9 vfmadd132ps %zmm1,%zmm1,%zmm1 953d: 62 f2 6d 48 98 d2 vfmadd132ps %zmm2,%zmm2,%zmm2 9543: 62 f1 64 48 58 db vaddps %zmm3,%zmm3,%zmm3 9549: 62 f2 5d 48 98 e4 vfmadd132ps %zmm4,%zmm4,%zmm4 954f: 62 f2 55 48 98 ed vfmadd132ps %zmm5,%zmm5,%zmm5 9555: 62 f1 4c 48 58 f6 vaddps %zmm6,%zmm6,%zmm6 955b: 62 f2 45 48 98 ff vfmadd132ps %zmm7,%zmm7,%zmm7 9561: 62 52 3d 48 98 c0 vfmadd132ps %zmm8,%zmm8,%zmm8 9567: 62 51 34 48 58 c9 vaddps %zmm9,%zmm9,%zmm9 956d: 62 52 2d 48 98 d2 vfmadd132ps %zmm10,%zmm10,%zmm10 9573: 62 52 25 48 98 db vfmadd132ps %zmm11,%zmm11,%zmm11 9579: c4 41 1c 58 e4 vaddps %ymm12,%ymm12,%ymm12 957e: 62 52 15 48 98 ed vfmadd132ps %zmm13,%zmm13,%zmm13 9584: 62 52 0d 48 98 f6 vfmadd132ps %zmm14,%zmm14,%zmm14 958a: 62 51 04 48 58 ff vaddps %zmm15,%zmm15,%zmm15 9590: 4c 29 cf sub %r9,%rdi 9593: 7f 9c jg 9531 9595: 41 59 pop %r9 9597: c3 retq 0000000000009598 : 9598: 41 51 push %r9 959a: 49 c7 c1 10 00 00 00 mov $0x10,%r9 95a1: f3 49 0f 2a c1 cvtsi2ss %r9,%xmm0 95a6: c4 e2 7d 18 c8 vbroadcastss %xmm0,%ymm1 95ab: c5 fd 6f d1 vmovdqa %ymm1,%ymm2 95af: c5 fd 6f d9 vmovdqa %ymm1,%ymm3 95b3: c5 fd 6f e1 vmovdqa %ymm1,%ymm4 95b7: c5 fd 6f e9 vmovdqa %ymm1,%ymm5 95bb: c5 fd 6f f1 vmovdqa %ymm1,%ymm6 95bf: c5 fd 6f f9 vmovdqa %ymm1,%ymm7 95c3: c5 7d 6f c1 vmovdqa %ymm1,%ymm8 95c7: c5 7d 6f c9 vmovdqa %ymm1,%ymm9 95cb: c5 7d 6f d1 vmovdqa %ymm1,%ymm10 95cf: c5 7d 6f d9 vmovdqa %ymm1,%ymm11 95d3: c5 7d 6f e1 vmovdqa %ymm1,%ymm12 95d7: c5 7d 6f e9 vmovdqa %ymm1,%ymm13 95db: c5 7d 6f f1 vmovdqa %ymm1,%ymm14 95df: c5 7d 6f f9 vmovdqa %ymm1,%ymm15 00000000000095e3 : 95e3: c5 fc 58 c0 vaddps %ymm0,%ymm0,%ymm0 95e7: c4 e2 75 98 c9 vfmadd132ps %ymm1,%ymm1,%ymm1 95ec: c4 e2 6d 98 d2 vfmadd132ps %ymm2,%ymm2,%ymm2 95f1: c5 e4 58 db vaddps %ymm3,%ymm3,%ymm3 95f5: c4 e2 5d 98 e4 vfmadd132ps 
%ymm4,%ymm4,%ymm4 95fa: c4 e2 55 98 ed vfmadd132ps %ymm5,%ymm5,%ymm5 95ff: c5 cc 58 f6 vaddps %ymm6,%ymm6,%ymm6 9603: c4 e2 45 98 ff vfmadd132ps %ymm7,%ymm7,%ymm7 9608: c4 42 3d 98 c0 vfmadd132ps %ymm8,%ymm8,%ymm8 960d: c4 41 34 58 c9 vaddps %ymm9,%ymm9,%ymm9 9612: c4 42 2d 98 d2 vfmadd132ps %ymm10,%ymm10,%ymm10 9617: c4 42 25 98 db vfmadd132ps %ymm11,%ymm11,%ymm11 961c: c4 41 1c 58 e4 vaddps %ymm12,%ymm12,%ymm12 9621: c4 42 15 98 ed vfmadd132ps %ymm13,%ymm13,%ymm13 9626: c4 42 0d 98 f6 vfmadd132ps %ymm14,%ymm14,%ymm14 962b: c4 41 04 58 ff vaddps %ymm15,%ymm15,%ymm15 9630: 4c 29 cf sub %r9,%rdi 9633: 7f ae jg 95e3 9635: 41 59 pop %r9 9637: c3 retq 0000000000009638 : 9638: 41 51 push %r9 963a: 41 50 push %r8 963c: 49 c7 c1 14 00 00 00 mov $0x14,%r9 9643: 66 49 0f 6e c9 movq %r9,%xmm1 9648: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 964d: 62 f2 7d 48 18 f6 vbroadcastss %xmm6,%zmm6 9653: 62 f1 7c 48 10 ee vmovups %zmm6,%zmm5 9659: 62 f1 7c 48 10 fe vmovups %zmm6,%zmm7 965f: 62 71 7c 48 10 c6 vmovups %zmm6,%zmm8 9665: 62 71 7c 48 10 ce vmovups %zmm6,%zmm9 966b: 62 71 7c 48 10 d6 vmovups %zmm6,%zmm10 9671: 62 71 7c 48 10 de vmovups %zmm6,%zmm11 9677: 62 71 7c 48 10 e6 vmovups %zmm6,%zmm12 967d: 62 71 7c 48 10 ee vmovups %zmm6,%zmm13 9683: 62 71 7c 48 10 f6 vmovups %zmm6,%zmm14 9689: 62 71 7c 48 10 fe vmovups %zmm6,%zmm15 000000000000968f : 968f: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 9695: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 969b: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96a1: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96a7: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96ad: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96b3: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96b9: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96bf: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96c5: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96cb: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96d1: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 
96d7: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96dd: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96e3: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96e9: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96ef: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96f5: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 96fb: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 9701: 62 f2 55 48 98 fe vfmadd132ps %zmm6,%zmm5,%zmm7 9707: 4c 29 cf sub %r9,%rdi 970a: 75 83 jne 968f 970c: 66 48 0f 7e c8 movq %xmm1,%rax 9711: c5 f8 77 vzeroupper 9714: 41 58 pop %r8 9716: 41 59 pop %r9 9718: c3 retq 0000000000009719 : 9719: 41 51 push %r9 971b: 41 50 push %r8 971d: 49 c7 c1 14 00 00 00 mov $0x14,%r9 9724: 66 49 0f 6e c9 movq %r9,%xmm1 9729: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 972e: c4 e2 7d 18 f6 vbroadcastss %xmm6,%ymm6 9733: c5 fc 10 ee vmovups %ymm6,%ymm5 9737: c5 fc 10 fe vmovups %ymm6,%ymm7 973b: c5 7c 10 c6 vmovups %ymm6,%ymm8 973f: c5 7c 10 ce vmovups %ymm6,%ymm9 9743: c5 7c 10 d6 vmovups %ymm6,%ymm10 9747: c5 7c 10 de vmovups %ymm6,%ymm11 974b: c5 7c 10 e6 vmovups %ymm6,%ymm12 974f: c5 7c 10 ee vmovups %ymm6,%ymm13 9753: c5 7c 10 f6 vmovups %ymm6,%ymm14 9757: c5 7c 10 fe vmovups %ymm6,%ymm15 000000000000975b : 975b: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 9760: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 9765: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 976a: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 976f: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 9774: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 9779: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 977e: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 9783: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 9788: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 978d: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 9792: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 9797: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 979c: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 97a1: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 97a6: 
c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 97ab: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 97b0: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 97b5: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 97ba: c4 e2 55 98 fe vfmadd132ps %ymm6,%ymm5,%ymm7 97bf: 4c 29 cf sub %r9,%rdi 97c2: 75 97 jne 975b 97c4: 66 48 0f 7e c8 movq %xmm1,%rax 97c9: c5 f8 77 vzeroupper 97cc: 41 58 pop %r8 97ce: 41 59 pop %r9 97d0: c3 retq 00000000000097d1 : 97d1: 41 51 push %r9 97d3: 41 50 push %r8 97d5: c5 f8 77 vzeroupper 97d8: 49 c7 c1 14 00 00 00 mov $0x14,%r9 97df: 66 49 0f 6e c9 movq %r9,%xmm1 97e4: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 97e9: c4 e2 79 18 f6 vbroadcastss %xmm6,%xmm6 97ee: c5 f8 10 ee vmovups %xmm6,%xmm5 97f2: c5 f8 10 fe vmovups %xmm6,%xmm7 97f6: c5 78 10 c6 vmovups %xmm6,%xmm8 97fa: c5 78 10 ce vmovups %xmm6,%xmm9 97fe: c5 78 10 d6 vmovups %xmm6,%xmm10 9802: c5 78 10 de vmovups %xmm6,%xmm11 9806: c5 78 10 e6 vmovups %xmm6,%xmm12 980a: c5 78 10 ee vmovups %xmm6,%xmm13 980e: c5 78 10 f6 vmovups %xmm6,%xmm14 9812: c5 78 10 fe vmovups %xmm6,%xmm15 0000000000009816 : 9816: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 981b: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 9820: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 9825: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 982a: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 982f: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 9834: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 9839: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 983e: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 9843: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 9848: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 984d: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 9852: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 9857: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 985c: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 9861: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 9866: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 986b: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 9870: c4 e2 
51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 9875: c4 e2 51 98 fe vfmadd132ps %xmm6,%xmm5,%xmm7 987a: 4c 29 cf sub %r9,%rdi 987d: 75 97 jne 9816 987f: 66 48 0f 7e c8 movq %xmm1,%rax 9884: c5 f8 77 vzeroupper 9887: 41 58 pop %r8 9889: 41 59 pop %r9 988b: c3 retq 000000000000988c : 988c: 41 51 push %r9 988e: 41 50 push %r8 9890: 49 c7 c1 14 00 00 00 mov $0x14,%r9 9897: 66 49 0f 6e c9 movq %r9,%xmm1 989c: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 00000000000098a1 : 98a1: 0f 58 f6 addps %xmm6,%xmm6 98a4: 0f 58 f6 addps %xmm6,%xmm6 98a7: 0f 58 f6 addps %xmm6,%xmm6 98aa: 0f 58 f6 addps %xmm6,%xmm6 98ad: 0f 58 f6 addps %xmm6,%xmm6 98b0: 0f 58 f6 addps %xmm6,%xmm6 98b3: 0f 58 f6 addps %xmm6,%xmm6 98b6: 0f 58 f6 addps %xmm6,%xmm6 98b9: 0f 58 f6 addps %xmm6,%xmm6 98bc: 0f 58 f6 addps %xmm6,%xmm6 98bf: 0f 58 f6 addps %xmm6,%xmm6 98c2: 0f 58 f6 addps %xmm6,%xmm6 98c5: 0f 58 f6 addps %xmm6,%xmm6 98c8: 0f 58 f6 addps %xmm6,%xmm6 98cb: 0f 58 f6 addps %xmm6,%xmm6 98ce: 0f 58 f6 addps %xmm6,%xmm6 98d1: 0f 58 f6 addps %xmm6,%xmm6 98d4: 0f 58 f6 addps %xmm6,%xmm6 98d7: 0f 58 f6 addps %xmm6,%xmm6 98da: 0f 58 f6 addps %xmm6,%xmm6 98dd: 4c 29 cf sub %r9,%rdi 98e0: 75 bf jne 98a1 98e2: 66 48 0f 7e c8 movq %xmm1,%rax 98e7: 41 58 pop %r8 98e9: 41 59 pop %r9 98eb: c3 retq 00000000000098ec : 98ec: 41 51 push %r9 98ee: 41 50 push %r8 98f0: 49 c7 c1 14 00 00 00 mov $0x14,%r9 98f7: 66 49 0f 6e c9 movq %r9,%xmm1 98fc: f3 49 0f 2a f1 cvtsi2ss %r9,%xmm6 0000000000009901 : 9901: 0f 59 f6 mulps %xmm6,%xmm6 9904: 0f 59 f6 mulps %xmm6,%xmm6 9907: 0f 59 f6 mulps %xmm6,%xmm6 990a: 0f 59 f6 mulps %xmm6,%xmm6 990d: 0f 59 f6 mulps %xmm6,%xmm6 9910: 0f 59 f6 mulps %xmm6,%xmm6 9913: 0f 59 f6 mulps %xmm6,%xmm6 9916: 0f 59 f6 mulps %xmm6,%xmm6 9919: 0f 59 f6 mulps %xmm6,%xmm6 991c: 0f 59 f6 mulps %xmm6,%xmm6 991f: 0f 59 f6 mulps %xmm6,%xmm6 9922: 0f 59 f6 mulps %xmm6,%xmm6 9925: 0f 59 f6 mulps %xmm6,%xmm6 9928: 0f 59 f6 mulps %xmm6,%xmm6 992b: 0f 59 f6 mulps %xmm6,%xmm6 992e: 0f 59 f6 mulps %xmm6,%xmm6 9931: 0f 59 f6 mulps 
%xmm6,%xmm6 9934: 0f 59 f6 mulps %xmm6,%xmm6 9937: 0f 59 f6 mulps %xmm6,%xmm6 993a: 0f 59 f6 mulps %xmm6,%xmm6 993d: 4c 29 cf sub %r9,%rdi 9940: 75 bf jne 9901 9942: 66 48 0f 7e c8 movq %xmm1,%rax 9947: 41 58 pop %r8 9949: 41 59 pop %r9 994b: c3 retq 000000000000994c : 994c: 41 51 push %r9 994e: 41 50 push %r8 9950: 49 c7 c1 14 00 00 00 mov $0x14,%r9 9957: f3 49 0f 2a e1 cvtsi2ss %r9,%xmm4 995c: f3 49 0f 2a d9 cvtsi2ss %r9,%xmm3 9961: f3 49 0f 2a d1 cvtsi2ss %r9,%xmm2 9966: f3 49 0f 2a c9 cvtsi2ss %r9,%xmm1 996b: f3 49 0f 2a c1 cvtsi2ss %r9,%xmm0 0000000000009970 : 9970: 0f 59 c0 mulps %xmm0,%xmm0 9973: 0f 59 c9 mulps %xmm1,%xmm1 9976: 0f 59 d2 mulps %xmm2,%xmm2 9979: 0f 59 db mulps %xmm3,%xmm3 997c: 0f 59 e4 mulps %xmm4,%xmm4 997f: 0f 59 c0 mulps %xmm0,%xmm0 9982: 0f 59 c9 mulps %xmm1,%xmm1 9985: 0f 59 d2 mulps %xmm2,%xmm2 9988: 0f 59 db mulps %xmm3,%xmm3 998b: 0f 59 e4 mulps %xmm4,%xmm4 998e: 0f 59 c0 mulps %xmm0,%xmm0 9991: 0f 59 c9 mulps %xmm1,%xmm1 9994: 0f 59 d2 mulps %xmm2,%xmm2 9997: 0f 59 db mulps %xmm3,%xmm3 999a: 0f 59 e4 mulps %xmm4,%xmm4 999d: 0f 59 c0 mulps %xmm0,%xmm0 99a0: 0f 59 c9 mulps %xmm1,%xmm1 99a3: 0f 59 d2 mulps %xmm2,%xmm2 99a6: 0f 59 db mulps %xmm3,%xmm3 99a9: 0f 59 e4 mulps %xmm4,%xmm4 99ac: 4c 29 cf sub %r9,%rdi 99af: 75 bf jne 9970 99b1: 66 48 0f 7e c8 movq %xmm1,%rax 99b6: 41 58 pop %r8 99b8: 41 59 pop %r9 99ba: c3 retq 00000000000099bb : 99bb: 41 51 push %r9 99bd: 41 50 push %r8 99bf: 49 c7 c1 14 00 00 00 mov $0x14,%r9 99c6: f3 49 0f 2a e1 cvtsi2ss %r9,%xmm4 99cb: f3 49 0f 2a d9 cvtsi2ss %r9,%xmm3 99d0: f3 49 0f 2a d1 cvtsi2ss %r9,%xmm2 99d5: f3 49 0f 2a c9 cvtsi2ss %r9,%xmm1 99da: f3 49 0f 2a c1 cvtsi2ss %r9,%xmm0 00000000000099df : 99df: 0f 58 c0 addps %xmm0,%xmm0 99e2: 0f 58 c9 addps %xmm1,%xmm1 99e5: 0f 58 d2 addps %xmm2,%xmm2 99e8: 0f 58 db addps %xmm3,%xmm3 99eb: 0f 58 e4 addps %xmm4,%xmm4 99ee: 0f 58 c0 addps %xmm0,%xmm0 99f1: 0f 58 c9 addps %xmm1,%xmm1 99f4: 0f 58 d2 addps %xmm2,%xmm2 99f7: 0f 58 db addps %xmm3,%xmm3 99fa: 0f 
58 e4 addps %xmm4,%xmm4 99fd: 0f 58 c0 addps %xmm0,%xmm0 9a00: 0f 58 c9 addps %xmm1,%xmm1 9a03: 0f 58 d2 addps %xmm2,%xmm2 9a06: 0f 58 db addps %xmm3,%xmm3 9a09: 0f 58 e4 addps %xmm4,%xmm4 9a0c: 0f 58 c0 addps %xmm0,%xmm0 9a0f: 0f 58 c9 addps %xmm1,%xmm1 9a12: 0f 58 d2 addps %xmm2,%xmm2 9a15: 0f 58 db addps %xmm3,%xmm3 9a18: 0f 58 e4 addps %xmm4,%xmm4 9a1b: 4c 29 cf sub %r9,%rdi 9a1e: 75 bf jne 99df 9a20: 66 48 0f 7e c8 movq %xmm1,%rax 9a25: 41 58 pop %r8 9a27: 41 59 pop %r9 9a29: c3 retq 0000000000009a2a : 9a2a: 53 push %rbx 9a2b: 51 push %rcx 9a2c: 41 50 push %r8 9a2e: 41 51 push %r9 9a30: 41 52 push %r10 9a32: 41 53 push %r11 9a34: 41 54 push %r12 9a36: 41 55 push %r13 9a38: 41 56 push %r14 9a3a: 41 57 push %r15 9a3c: 49 c7 c0 01 00 00 00 mov $0x1,%r8 9a43: 49 c7 c1 14 00 00 00 mov $0x14,%r9 9a4a: 4c 89 c3 mov %r8,%rbx 9a4d: 4c 89 c1 mov %r8,%rcx 9a50: 4d 89 c2 mov %r8,%r10 9a53: 4d 89 c3 mov %r8,%r11 9a56: 4d 89 c4 mov %r8,%r12 9a59: 4d 89 c5 mov %r8,%r13 9a5c: 4d 89 c6 mov %r8,%r14 9a5f: 4d 89 cf mov %r9,%r15 0000000000009a62 : 9a62: 4d 0f af f9 imul %r9,%r15 9a66: 4d 0f af f9 imul %r9,%r15 9a6a: 4d 0f af f9 imul %r9,%r15 9a6e: 4d 0f af f9 imul %r9,%r15 9a72: 4d 0f af f9 imul %r9,%r15 9a76: 4d 0f af f9 imul %r9,%r15 9a7a: 4d 0f af f9 imul %r9,%r15 9a7e: 4d 0f af f9 imul %r9,%r15 9a82: 4d 0f af f9 imul %r9,%r15 9a86: 4d 0f af f9 imul %r9,%r15 9a8a: 4d 0f af f9 imul %r9,%r15 9a8e: 4d 0f af f9 imul %r9,%r15 9a92: 4d 0f af f9 imul %r9,%r15 9a96: 4d 0f af f9 imul %r9,%r15 9a9a: 4d 0f af f9 imul %r9,%r15 9a9e: 4d 0f af f9 imul %r9,%r15 9aa2: 4d 0f af f9 imul %r9,%r15 9aa6: 4d 0f af f9 imul %r9,%r15 9aaa: 4d 0f af f9 imul %r9,%r15 9aae: 4d 0f af f9 imul %r9,%r15 9ab2: 4c 29 cf sub %r9,%rdi 9ab5: 75 ab jne 9a62 9ab7: 41 5f pop %r15 9ab9: 41 5e pop %r14 9abb: 41 5d pop %r13 9abd: 41 5c pop %r12 9abf: 41 5b pop %r11 9ac1: 41 5a pop %r10 9ac3: 41 59 pop %r9 9ac5: 41 58 pop %r8 9ac7: 59 pop %rcx 9ac8: 5b pop %rbx 9ac9: c3 retq 0000000000009aca : 9aca: 53 push %rbx 9acb: 
51 push %rcx 9acc: 41 50 push %r8 9ace: 41 51 push %r9 9ad0: 41 52 push %r10 9ad2: 41 53 push %r11 9ad4: 41 54 push %r12 9ad6: 41 55 push %r13 9ad8: 41 56 push %r14 9ada: 41 57 push %r15 9adc: 49 c7 c0 01 00 00 00 mov $0x1,%r8 9ae3: 49 c7 c1 14 00 00 00 mov $0x14,%r9 9aea: 4c 89 c3 mov %r8,%rbx 9aed: 4c 89 c1 mov %r8,%rcx 9af0: 4d 89 c2 mov %r8,%r10 9af3: 4d 89 c3 mov %r8,%r11 9af6: 4d 89 c4 mov %r8,%r12 9af9: 4d 89 c5 mov %r8,%r13 9afc: 4d 89 c6 mov %r8,%r14 9aff: 4d 89 cf mov %r9,%r15 0000000000009b02 : 9b02: 66 45 0f af f9 imul %r9w,%r15w 9b07: 66 45 0f af f9 imul %r9w,%r15w 9b0c: 66 45 0f af f9 imul %r9w,%r15w 9b11: 66 45 0f af f9 imul %r9w,%r15w 9b16: 66 45 0f af f9 imul %r9w,%r15w 9b1b: 66 45 0f af f9 imul %r9w,%r15w 9b20: 66 45 0f af f9 imul %r9w,%r15w 9b25: 66 45 0f af f9 imul %r9w,%r15w 9b2a: 66 45 0f af f9 imul %r9w,%r15w 9b2f: 66 45 0f af f9 imul %r9w,%r15w 9b34: 66 45 0f af f9 imul %r9w,%r15w 9b39: 66 45 0f af f9 imul %r9w,%r15w 9b3e: 66 45 0f af f9 imul %r9w,%r15w 9b43: 66 45 0f af f9 imul %r9w,%r15w 9b48: 66 45 0f af f9 imul %r9w,%r15w 9b4d: 66 45 0f af f9 imul %r9w,%r15w 9b52: 66 45 0f af f9 imul %r9w,%r15w 9b57: 66 45 0f af f9 imul %r9w,%r15w 9b5c: 66 45 0f af f9 imul %r9w,%r15w 9b61: 66 45 0f af f9 imul %r9w,%r15w 9b66: 4c 29 cf sub %r9,%rdi 9b69: 75 97 jne 9b02 9b6b: 41 5f pop %r15 9b6d: 41 5e pop %r14 9b6f: 41 5d pop %r13 9b71: 41 5c pop %r12 9b73: 41 5b pop %r11 9b75: 41 5a pop %r10 9b77: 41 59 pop %r9 9b79: 41 58 pop %r8 9b7b: 59 pop %rcx 9b7c: 5b pop %rbx 9b7d: c3 retq 0000000000009b7e : 9b7e: 53 push %rbx 9b7f: 51 push %rcx 9b80: 41 50 push %r8 9b82: 41 51 push %r9 9b84: 41 52 push %r10 9b86: 41 53 push %r11 9b88: 41 54 push %r12 9b8a: 41 55 push %r13 9b8c: 41 56 push %r14 9b8e: 41 57 push %r15 9b90: 49 c7 c0 01 00 00 00 mov $0x1,%r8 9b97: 49 c7 c1 14 00 00 00 mov $0x14,%r9 9b9e: 4c 89 c3 mov %r8,%rbx 9ba1: 4c 89 c1 mov %r8,%rcx 9ba4: 4d 89 c2 mov %r8,%r10 9ba7: 4d 89 c3 mov %r8,%r11 9baa: 4d 89 c4 mov %r8,%r12 9bad: 4d 89 c5 mov %r8,%r13 
9bb0: 4d 89 c6 mov %r8,%r14 9bb3: 4d 89 cf mov %r9,%r15 0000000000009bb6 : 9bb6: 66 45 0f af f9 imul %r9w,%r15w 9bbb: 66 45 0f af f1 imul %r9w,%r14w 9bc0: 66 45 0f af e9 imul %r9w,%r13w 9bc5: 66 45 0f af e1 imul %r9w,%r12w 9bca: 66 45 0f af d9 imul %r9w,%r11w 9bcf: 66 45 0f af f9 imul %r9w,%r15w 9bd4: 66 45 0f af f1 imul %r9w,%r14w 9bd9: 66 45 0f af e9 imul %r9w,%r13w 9bde: 66 45 0f af e1 imul %r9w,%r12w 9be3: 66 45 0f af d9 imul %r9w,%r11w 9be8: 66 45 0f af f9 imul %r9w,%r15w 9bed: 66 45 0f af f1 imul %r9w,%r14w 9bf2: 66 45 0f af e9 imul %r9w,%r13w 9bf7: 66 45 0f af e1 imul %r9w,%r12w 9bfc: 66 45 0f af d9 imul %r9w,%r11w 9c01: 66 45 0f af f9 imul %r9w,%r15w 9c06: 66 45 0f af f1 imul %r9w,%r14w 9c0b: 66 45 0f af e9 imul %r9w,%r13w 9c10: 66 45 0f af e1 imul %r9w,%r12w 9c15: 66 45 0f af d9 imul %r9w,%r11w 9c1a: 4c 29 cf sub %r9,%rdi 9c1d: 75 97 jne 9bb6 9c1f: 41 5f pop %r15 9c21: 41 5e pop %r14 9c23: 41 5d pop %r13 9c25: 41 5c pop %r12 9c27: 41 5b pop %r11 9c29: 41 5a pop %r10 9c2b: 41 59 pop %r9 9c2d: 41 58 pop %r8 9c2f: 59 pop %rcx 9c30: 5b pop %rbx 9c31: c3 retq 0000000000009c32 : 9c32: 53 push %rbx 9c33: 51 push %rcx 9c34: 56 push %rsi 9c35: 41 50 push %r8 9c37: 41 51 push %r9 9c39: 41 52 push %r10 9c3b: 41 53 push %r11 9c3d: 41 54 push %r12 9c3f: 41 55 push %r13 9c41: 41 56 push %r14 9c43: 41 57 push %r15 9c45: 49 c7 c0 01 00 00 00 mov $0x1,%r8 9c4c: 49 c7 c1 14 00 00 00 mov $0x14,%r9 9c53: 4c 89 c3 mov %r8,%rbx 9c56: 4c 89 c1 mov %r8,%rcx 9c59: 4d 89 c2 mov %r8,%r10 9c5c: 4d 89 c3 mov %r8,%r11 9c5f: 4d 89 c4 mov %r8,%r12 9c62: 4d 89 c5 mov %r8,%r13 9c65: 4d 89 c6 mov %r8,%r14 9c68: 4d 89 cf mov %r9,%r15 0000000000009c6b : 9c6b: 4d 0f af f9 imul %r9,%r15 9c6f: 4d 89 cf mov %r9,%r15 9c72: 4d 0f af f1 imul %r9,%r14 9c76: 4d 89 ce mov %r9,%r14 9c79: 4d 0f af e9 imul %r9,%r13 9c7d: 4d 89 cd mov %r9,%r13 9c80: 4d 0f af e1 imul %r9,%r12 9c84: 4d 89 cc mov %r9,%r12 9c87: 4d 0f af d9 imul %r9,%r11 9c8b: 4d 89 cb mov %r9,%r11 9c8e: 4d 0f af d1 imul %r9,%r10 9c92: 4d 89 
ca mov %r9,%r10 9c95: 4d 0f af c1 imul %r9,%r8 9c99: 4d 89 c8 mov %r9,%r8 9c9c: 49 0f af d9 imul %r9,%rbx 9ca0: 4c 89 cb mov %r9,%rbx 9ca3: 49 0f af c9 imul %r9,%rcx 9ca7: 4c 89 c9 mov %r9,%rcx 9caa: 49 0f af f1 imul %r9,%rsi 9cae: 4c 89 ce mov %r9,%rsi 9cb1: 4d 0f af f9 imul %r9,%r15 9cb5: 4d 89 cf mov %r9,%r15 9cb8: 4d 0f af f1 imul %r9,%r14 9cbc: 4d 89 ce mov %r9,%r14 9cbf: 4d 0f af e9 imul %r9,%r13 9cc3: 4d 89 cd mov %r9,%r13 9cc6: 4d 0f af e1 imul %r9,%r12 9cca: 4d 89 cc mov %r9,%r12 9ccd: 4d 0f af d9 imul %r9,%r11 9cd1: 4d 89 cb mov %r9,%r11 9cd4: 4d 0f af d1 imul %r9,%r10 9cd8: 4d 89 ca mov %r9,%r10 9cdb: 4d 0f af c1 imul %r9,%r8 9cdf: 4d 89 c8 mov %r9,%r8 9ce2: 49 0f af d9 imul %r9,%rbx 9ce6: 4c 89 cb mov %r9,%rbx 9ce9: 49 0f af c9 imul %r9,%rcx 9ced: 4c 89 c9 mov %r9,%rcx 9cf0: 49 0f af f1 imul %r9,%rsi 9cf4: 4c 89 ce mov %r9,%rsi 9cf7: 4c 29 cf sub %r9,%rdi 9cfa: 0f 85 6b ff ff ff jne 9c6b 9d00: 41 5f pop %r15 9d02: 41 5e pop %r14 9d04: 41 5d pop %r13 9d06: 41 5c pop %r12 9d08: 41 5b pop %r11 9d0a: 41 5a pop %r10 9d0c: 41 59 pop %r9 9d0e: 41 58 pop %r8 9d10: 5e pop %rsi 9d11: 59 pop %rcx 9d12: 5b pop %rbx 9d13: c3 retq 0000000000009d14 : 9d14: 53 push %rbx 9d15: 51 push %rcx 9d16: 56 push %rsi 9d17: 41 50 push %r8 9d19: 41 51 push %r9 9d1b: 41 52 push %r10 9d1d: 41 53 push %r11 9d1f: 41 54 push %r12 9d21: 41 55 push %r13 9d23: 41 56 push %r14 9d25: 41 57 push %r15 9d27: 49 c7 c0 01 00 00 00 mov $0x1,%r8 9d2e: 49 c7 c1 14 00 00 00 mov $0x14,%r9 9d35: 4c 89 c3 mov %r8,%rbx 9d38: 4c 89 c1 mov %r8,%rcx 9d3b: 4d 89 c2 mov %r8,%r10 9d3e: 4d 89 c3 mov %r8,%r11 9d41: 4d 89 c4 mov %r8,%r12 9d44: 4d 89 c5 mov %r8,%r13 9d47: 4d 89 c6 mov %r8,%r14 9d4a: 4d 89 cf mov %r9,%r15 0000000000009d4d : 9d4d: 4d 0f af f9 imul %r9,%r15 9d51: 66 45 0f af f1 imul %r9w,%r14w 9d56: 4d 0f af e9 imul %r9,%r13 9d5a: 66 45 0f af e1 imul %r9w,%r12w 9d5f: 4d 0f af d9 imul %r9,%r11 9d63: 66 45 0f af d1 imul %r9w,%r10w 9d68: 4d 0f af c1 imul %r9,%r8 9d6c: 66 41 0f af d9 imul %r9w,%bx 9d71: 
49 0f af c9 imul %r9,%rcx 9d75: 66 41 0f af f1 imul %r9w,%si 9d7a: 4d 0f af f9 imul %r9,%r15 9d7e: 66 45 0f af f1 imul %r9w,%r14w 9d83: 4d 0f af e9 imul %r9,%r13 9d87: 66 45 0f af e1 imul %r9w,%r12w 9d8c: 4d 0f af d9 imul %r9,%r11 9d90: 66 45 0f af d1 imul %r9w,%r10w 9d95: 4d 0f af c1 imul %r9,%r8 9d99: 66 41 0f af d9 imul %r9w,%bx 9d9e: 49 0f af c9 imul %r9,%rcx 9da2: 66 41 0f af f1 imul %r9w,%si 9da7: 4c 29 cf sub %r9,%rdi 9daa: 75 a1 jne 9d4d 9dac: 41 5f pop %r15 9dae: 41 5e pop %r14 9db0: 41 5d pop %r13 9db2: 41 5c pop %r12 9db4: 41 5b pop %r11 9db6: 41 5a pop %r10 9db8: 41 59 pop %r9 9dba: 41 58 pop %r8 9dbc: 5e pop %rsi 9dbd: 59 pop %rcx 9dbe: 5b pop %rbx 9dbf: c3 retq 0000000000009dc0 : 9dc0: 53 push %rbx 9dc1: 51 push %rcx 9dc2: 52 push %rdx 9dc3: 56 push %rsi 9dc4: 41 50 push %r8 9dc6: 41 51 push %r9 9dc8: 41 52 push %r10 9dca: 41 53 push %r11 9dcc: 41 54 push %r12 9dce: 41 55 push %r13 9dd0: 41 56 push %r14 9dd2: 41 57 push %r15 9dd4: 49 c7 c0 01 00 00 00 mov $0x1,%r8 9ddb: 49 c7 c1 18 00 00 00 mov $0x18,%r9 9de2: 4c 89 c3 mov %r8,%rbx 9de5: 4c 89 c1 mov %r8,%rcx 9de8: 4c 89 c6 mov %r8,%rsi 9deb: 4d 89 c2 mov %r8,%r10 9dee: 4d 89 c3 mov %r8,%r11 9df1: 4d 89 c4 mov %r8,%r12 9df4: 4d 89 c5 mov %r8,%r13 9df7: 4d 89 c6 mov %r8,%r14 9dfa: 4d 89 cf mov %r9,%r15 0000000000009dfd : 9dfd: 4d 0f af f9 imul %r9,%r15 9e01: 66 45 0f af f1 imul %r9w,%r14w 9e06: 66 45 0f af e9 imul %r9w,%r13w 9e0b: 4d 0f af e1 imul %r9,%r12 9e0f: 66 45 0f af d9 imul %r9w,%r11w 9e14: 66 45 0f af d1 imul %r9w,%r10w 9e19: 4d 0f af c1 imul %r9,%r8 9e1d: 66 45 0f af f1 imul %r9w,%r14w 9e22: 66 45 0f af e9 imul %r9w,%r13w 9e27: 49 0f af c9 imul %r9,%rcx 9e2b: 66 45 0f af d9 imul %r9w,%r11w 9e30: 66 45 0f af d1 imul %r9w,%r10w 9e35: 49 0f af d9 imul %r9,%rbx 9e39: 66 45 0f af f1 imul %r9w,%r14w 9e3e: 66 45 0f af e9 imul %r9w,%r13w 9e43: 49 0f af c1 imul %r9,%rax 9e47: 66 45 0f af d9 imul %r9w,%r11w 9e4c: 66 45 0f af d1 imul %r9w,%r10w 9e51: 49 0f af f1 imul %r9,%rsi 9e55: 66 45 0f af f1 imul 
%r9w,%r14w 9e5a: 66 45 0f af e9 imul %r9w,%r13w 9e5f: 49 0f af d1 imul %r9,%rdx 9e63: 66 45 0f af d9 imul %r9w,%r11w 9e68: 66 45 0f af d1 imul %r9w,%r10w 9e6d: 4c 29 cf sub %r9,%rdi 9e70: 7d 8b jge 9dfd 9e72: 41 5f pop %r15 9e74: 41 5e pop %r14 9e76: 41 5d pop %r13 9e78: 41 5c pop %r12 9e7a: 41 5b pop %r11 9e7c: 41 5a pop %r10 9e7e: 41 59 pop %r9 9e80: 41 58 pop %r8 9e82: 5e pop %rsi 9e83: 5a pop %rdx 9e84: 59 pop %rcx 9e85: 5b pop %rbx 9e86: c3 retq 0000000000009e87 : 9e87: 53 push %rbx 9e88: 51 push %rcx 9e89: 41 50 push %r8 9e8b: 41 51 push %r9 9e8d: 49 c7 c1 14 00 00 00 mov $0x14,%r9 0000000000009e94 : 9e94: 48 89 3e mov %rdi,(%rsi) 9e97: 48 89 7e 40 mov %rdi,0x40(%rsi) 9e9b: 48 89 be 80 00 00 00 mov %rdi,0x80(%rsi) 9ea2: 48 89 be c0 00 00 00 mov %rdi,0xc0(%rsi) 9ea9: 48 89 be 00 01 00 00 mov %rdi,0x100(%rsi) 9eb0: 48 89 be 40 01 00 00 mov %rdi,0x140(%rsi) 9eb7: 48 89 be 80 01 00 00 mov %rdi,0x180(%rsi) 9ebe: 48 89 be c0 01 00 00 mov %rdi,0x1c0(%rsi) 9ec5: 48 89 be 00 02 00 00 mov %rdi,0x200(%rsi) 9ecc: 48 89 be 40 02 00 00 mov %rdi,0x240(%rsi) 9ed3: 48 89 be 80 02 00 00 mov %rdi,0x280(%rsi) 9eda: 48 89 be c0 02 00 00 mov %rdi,0x2c0(%rsi) 9ee1: 48 89 be 00 03 00 00 mov %rdi,0x300(%rsi) 9ee8: 48 89 be 40 03 00 00 mov %rdi,0x340(%rsi) 9eef: 48 89 be 80 03 00 00 mov %rdi,0x380(%rsi) 9ef6: 48 89 be c0 03 00 00 mov %rdi,0x3c0(%rsi) 9efd: 48 89 be 00 04 00 00 mov %rdi,0x400(%rsi) 9f04: 48 89 be 40 04 00 00 mov %rdi,0x440(%rsi) 9f0b: 48 89 be 80 04 00 00 mov %rdi,0x480(%rsi) 9f12: 48 89 be c0 04 00 00 mov %rdi,0x4c0(%rsi) 9f19: 4c 29 cf sub %r9,%rdi 9f1c: 0f 85 72 ff ff ff jne 9e94 9f22: 41 59 pop %r9 9f24: 41 58 pop %r8 9f26: 59 pop %rcx 9f27: 5b pop %rbx 9f28: c3 retq 0000000000009f29 : 9f29: 53 push %rbx 9f2a: 51 push %rcx 9f2b: 41 50 push %r8 9f2d: 41 51 push %r9 9f2f: 49 c7 c1 14 00 00 00 mov $0x14,%r9 0000000000009f36 : 9f36: 66 44 0f 6f 16 movdqa (%rsi),%xmm10 9f3b: 66 44 0f 6f 5e 40 movdqa 0x40(%rsi),%xmm11 9f41: 66 44 0f 6f a6 80 00 movdqa 0x80(%rsi),%xmm12 
9f48: 00 00 9f4a: 66 44 0f 6f ae c0 00 movdqa 0xc0(%rsi),%xmm13 9f51: 00 00 9f53: 66 44 0f 6f b6 00 01 movdqa 0x100(%rsi),%xmm14 9f5a: 00 00 9f5c: 66 44 0f 6f 96 40 01 movdqa 0x140(%rsi),%xmm10 9f63: 00 00 9f65: 66 44 0f 6f 9e 80 01 movdqa 0x180(%rsi),%xmm11 9f6c: 00 00 9f6e: 66 44 0f 6f a6 c0 01 movdqa 0x1c0(%rsi),%xmm12 9f75: 00 00 9f77: 66 44 0f 6f ae 00 02 movdqa 0x200(%rsi),%xmm13 9f7e: 00 00 9f80: 66 44 0f 6f b6 40 02 movdqa 0x240(%rsi),%xmm14 9f87: 00 00 9f89: 66 44 0f 6f 96 80 02 movdqa 0x280(%rsi),%xmm10 9f90: 00 00 9f92: 66 44 0f 6f 9e c0 02 movdqa 0x2c0(%rsi),%xmm11 9f99: 00 00 9f9b: 66 44 0f 6f a6 00 03 movdqa 0x300(%rsi),%xmm12 9fa2: 00 00 9fa4: 66 44 0f 6f ae 40 03 movdqa 0x340(%rsi),%xmm13 9fab: 00 00 9fad: 66 44 0f 6f b6 80 03 movdqa 0x380(%rsi),%xmm14 9fb4: 00 00 9fb6: 66 44 0f 6f 96 c0 03 movdqa 0x3c0(%rsi),%xmm10 9fbd: 00 00 9fbf: 66 44 0f 6f 9e 00 04 movdqa 0x400(%rsi),%xmm11 9fc6: 00 00 9fc8: 66 44 0f 6f a6 40 04 movdqa 0x440(%rsi),%xmm12 9fcf: 00 00 9fd1: 66 44 0f 6f ae 80 04 movdqa 0x480(%rsi),%xmm13 9fd8: 00 00 9fda: 66 44 0f 6f b6 c0 04 movdqa 0x4c0(%rsi),%xmm14 9fe1: 00 00 9fe3: 4c 29 cf sub %r9,%rdi 9fe6: 0f 85 4a ff ff ff jne 9f36 9fec: 41 59 pop %r9 9fee: 41 58 pop %r8 9ff0: 59 pop %rcx 9ff1: 5b pop %rbx 9ff2: c3 retq 0000000000009ff3 : 9ff3: 53 push %rbx 9ff4: 51 push %rcx 9ff5: 41 50 push %r8 9ff7: 41 51 push %r9 9ff9: 49 c7 c1 14 00 00 00 mov $0x14,%r9 000000000000a000 : a000: 66 44 0f 6f 16 movdqa (%rsi),%xmm10 a005: 66 44 0f 6f 1e movdqa (%rsi),%xmm11 a00a: 66 44 0f 6f 26 movdqa (%rsi),%xmm12 a00f: 66 44 0f 6f 2e movdqa (%rsi),%xmm13 a014: 66 44 0f 6f 36 movdqa (%rsi),%xmm14 a019: 66 44 0f 6f 16 movdqa (%rsi),%xmm10 a01e: 66 44 0f 6f 1e movdqa (%rsi),%xmm11 a023: 66 44 0f 6f 26 movdqa (%rsi),%xmm12 a028: 66 44 0f 6f 2e movdqa (%rsi),%xmm13 a02d: 66 44 0f 6f 36 movdqa (%rsi),%xmm14 a032: 66 44 0f 6f 16 movdqa (%rsi),%xmm10 a037: 66 44 0f 6f 1e movdqa (%rsi),%xmm11 a03c: 66 44 0f 6f 26 movdqa (%rsi),%xmm12 a041: 66 44 0f 6f 2e movdqa 
(%rsi),%xmm13 a046: 66 44 0f 6f 36 movdqa (%rsi),%xmm14 a04b: 66 44 0f 6f 16 movdqa (%rsi),%xmm10 a050: 66 44 0f 6f 1e movdqa (%rsi),%xmm11 a055: 66 44 0f 6f 26 movdqa (%rsi),%xmm12 a05a: 66 44 0f 6f 2e movdqa (%rsi),%xmm13 a05f: 66 44 0f 6f 36 movdqa (%rsi),%xmm14 a064: 4c 29 cf sub %r9,%rdi a067: 75 97 jne a000 a069: 41 59 pop %r9 a06b: 41 58 pop %r8 a06d: 59 pop %rcx a06e: 5b pop %rbx a06f: c3 retq 000000000000a070 : a070: 53 push %rbx a071: 51 push %rcx a072: 41 50 push %r8 a074: 41 51 push %r9 a076: 49 c7 c1 14 00 00 00 mov $0x14,%r9 000000000000a07d : a07d: c5 7c 28 16 vmovaps (%rsi),%ymm10 a081: c5 7c 28 1e vmovaps (%rsi),%ymm11 a085: c5 7c 28 26 vmovaps (%rsi),%ymm12 a089: c5 7c 28 2e vmovaps (%rsi),%ymm13 a08d: c5 7c 28 36 vmovaps (%rsi),%ymm14 a091: c5 7c 28 16 vmovaps (%rsi),%ymm10 a095: c5 7c 28 1e vmovaps (%rsi),%ymm11 a099: c5 7c 28 26 vmovaps (%rsi),%ymm12 a09d: c5 7c 28 2e vmovaps (%rsi),%ymm13 a0a1: c5 7c 28 36 vmovaps (%rsi),%ymm14 a0a5: c5 7c 28 16 vmovaps (%rsi),%ymm10 a0a9: c5 7c 28 1e vmovaps (%rsi),%ymm11 a0ad: c5 7c 28 26 vmovaps (%rsi),%ymm12 a0b1: c5 7c 28 2e vmovaps (%rsi),%ymm13 a0b5: c5 7c 28 36 vmovaps (%rsi),%ymm14 a0b9: c5 7c 28 16 vmovaps (%rsi),%ymm10 a0bd: c5 7c 28 1e vmovaps (%rsi),%ymm11 a0c1: c5 7c 28 26 vmovaps (%rsi),%ymm12 a0c5: c5 7c 28 2e vmovaps (%rsi),%ymm13 a0c9: c5 7c 28 36 vmovaps (%rsi),%ymm14 a0cd: 4c 29 cf sub %r9,%rdi a0d0: 75 ab jne a07d a0d2: 41 59 pop %r9 a0d4: 41 58 pop %r8 a0d6: 59 pop %rcx a0d7: 5b pop %rbx a0d8: c3 retq 000000000000a0d9 : a0d9: 53 push %rbx a0da: 51 push %rcx a0db: 41 50 push %r8 a0dd: 41 51 push %r9 a0df: 49 c7 c1 14 00 00 00 mov $0x14,%r9 000000000000a0e6 : a0e6: 62 71 7c 48 28 16 vmovaps (%rsi),%zmm10 a0ec: 62 71 7c 48 28 1e vmovaps (%rsi),%zmm11 a0f2: 62 71 7c 48 28 26 vmovaps (%rsi),%zmm12 a0f8: 62 71 7c 48 28 2e vmovaps (%rsi),%zmm13 a0fe: 62 71 7c 48 28 36 vmovaps (%rsi),%zmm14 a104: 62 71 7c 48 28 16 vmovaps (%rsi),%zmm10 a10a: 62 71 7c 48 28 1e vmovaps (%rsi),%zmm11 a110: 62 71 7c 
48 28 26 vmovaps (%rsi),%zmm12 a116: 62 71 7c 48 28 2e vmovaps (%rsi),%zmm13 a11c: 62 71 7c 48 28 36 vmovaps (%rsi),%zmm14 a122: 62 71 7c 48 28 16 vmovaps (%rsi),%zmm10 a128: 62 71 7c 48 28 1e vmovaps (%rsi),%zmm11 a12e: 62 71 7c 48 28 26 vmovaps (%rsi),%zmm12 a134: 62 71 7c 48 28 2e vmovaps (%rsi),%zmm13 a13a: 62 71 7c 48 28 36 vmovaps (%rsi),%zmm14 a140: 62 71 7c 48 28 16 vmovaps (%rsi),%zmm10 a146: 62 71 7c 48 28 1e vmovaps (%rsi),%zmm11 a14c: 62 71 7c 48 28 26 vmovaps (%rsi),%zmm12 a152: 62 71 7c 48 28 2e vmovaps (%rsi),%zmm13 a158: 62 71 7c 48 28 36 vmovaps (%rsi),%zmm14 a15e: 4c 29 cf sub %r9,%rdi a161: 75 83 jne a0e6 a163: 41 59 pop %r9 a165: 41 58 pop %r8 a167: 59 pop %rcx a168: 5b pop %rbx a169: c3 retq 000000000000a16a : a16a: 53 push %rbx a16b: 51 push %rcx a16c: 41 50 push %r8 a16e: 41 51 push %r9 a170: 66 44 0f 6f 16 movdqa (%rsi),%xmm10 a175: 66 45 0f 6f da movdqa %xmm10,%xmm11 a17a: 66 45 0f 6f e2 movdqa %xmm10,%xmm12 a17f: 66 45 0f 6f ea movdqa %xmm10,%xmm13 a184: 66 45 0f 6f f2 movdqa %xmm10,%xmm14 a189: 49 c7 c1 14 00 00 00 mov $0x14,%r9 000000000000a190 : a190: 66 44 0f 7f 12 movdqa %xmm10,(%rdx) a195: 66 44 0f 7f 1a movdqa %xmm11,(%rdx) a19a: 66 44 0f 7f 22 movdqa %xmm12,(%rdx) a19f: 66 44 0f 7f 2a movdqa %xmm13,(%rdx) a1a4: 66 44 0f 7f 32 movdqa %xmm14,(%rdx) a1a9: 66 44 0f 7f 12 movdqa %xmm10,(%rdx) a1ae: 66 44 0f 7f 1a movdqa %xmm11,(%rdx) a1b3: 66 44 0f 7f 22 movdqa %xmm12,(%rdx) a1b8: 66 44 0f 7f 2a movdqa %xmm13,(%rdx) a1bd: 66 44 0f 7f 32 movdqa %xmm14,(%rdx) a1c2: 66 44 0f 7f 12 movdqa %xmm10,(%rdx) a1c7: 66 44 0f 7f 1a movdqa %xmm11,(%rdx) a1cc: 66 44 0f 7f 22 movdqa %xmm12,(%rdx) a1d1: 66 44 0f 7f 2a movdqa %xmm13,(%rdx) a1d6: 66 44 0f 7f 32 movdqa %xmm14,(%rdx) a1db: 66 44 0f 7f 12 movdqa %xmm10,(%rdx) a1e0: 66 44 0f 7f 1a movdqa %xmm11,(%rdx) a1e5: 66 44 0f 7f 22 movdqa %xmm12,(%rdx) a1ea: 66 44 0f 7f 2a movdqa %xmm13,(%rdx) a1ef: 66 44 0f 7f 32 movdqa %xmm14,(%rdx) a1f4: 4c 29 cf sub %r9,%rdi a1f7: 75 97 jne a190 a1f9: 41 59 pop %r9 
a1fb: 41 58 pop %r8 a1fd: 59 pop %rcx a1fe: 5b pop %rbx a1ff: c3 retq 000000000000a200 : a200: 53 push %rbx a201: 51 push %rcx a202: 41 50 push %r8 a204: 41 51 push %r9 a206: c5 7c 28 16 vmovaps (%rsi),%ymm10 a20a: c4 41 7c 28 da vmovaps %ymm10,%ymm11 a20f: c4 41 7c 28 e2 vmovaps %ymm10,%ymm12 a214: c4 41 7c 28 ea vmovaps %ymm10,%ymm13 a219: c4 41 7c 28 f2 vmovaps %ymm10,%ymm14 a21e: 49 c7 c1 14 00 00 00 mov $0x14,%r9 000000000000a225 : a225: c5 7c 29 12 vmovaps %ymm10,(%rdx) a229: c5 7c 29 1a vmovaps %ymm11,(%rdx) a22d: c5 7c 29 22 vmovaps %ymm12,(%rdx) a231: c5 7c 29 2a vmovaps %ymm13,(%rdx) a235: c5 7c 29 32 vmovaps %ymm14,(%rdx) a239: c5 7c 29 12 vmovaps %ymm10,(%rdx) a23d: c5 7c 29 1a vmovaps %ymm11,(%rdx) a241: c5 7c 29 22 vmovaps %ymm12,(%rdx) a245: c5 7c 29 2a vmovaps %ymm13,(%rdx) a249: c5 7c 29 32 vmovaps %ymm14,(%rdx) a24d: c5 7c 29 12 vmovaps %ymm10,(%rdx) a251: c5 7c 29 1a vmovaps %ymm11,(%rdx) a255: c5 7c 29 22 vmovaps %ymm12,(%rdx) a259: c5 7c 29 2a vmovaps %ymm13,(%rdx) a25d: c5 7c 29 32 vmovaps %ymm14,(%rdx) a261: c5 7c 29 12 vmovaps %ymm10,(%rdx) a265: c5 7c 29 1a vmovaps %ymm11,(%rdx) a269: c5 7c 29 22 vmovaps %ymm12,(%rdx) a26d: c5 7c 29 2a vmovaps %ymm13,(%rdx) a271: c5 7c 29 32 vmovaps %ymm14,(%rdx) a275: 4c 29 cf sub %r9,%rdi a278: 75 ab jne a225 a27a: 41 59 pop %r9 a27c: 41 58 pop %r8 a27e: 59 pop %rcx a27f: 5b pop %rbx a280: c3 retq 000000000000a281 : a281: 53 push %rbx a282: 51 push %rcx a283: 41 50 push %r8 a285: 41 51 push %r9 a287: 62 71 7c 48 28 16 vmovaps (%rsi),%zmm10 a28d: 62 51 7c 48 28 da vmovaps %zmm10,%zmm11 a293: 62 51 7c 48 28 e2 vmovaps %zmm10,%zmm12 a299: 62 51 7c 48 28 ea vmovaps %zmm10,%zmm13 a29f: 62 51 7c 48 28 f2 vmovaps %zmm10,%zmm14 a2a5: 49 c7 c1 14 00 00 00 mov $0x14,%r9 000000000000a2ac : a2ac: 62 71 7c 48 29 12 vmovaps %zmm10,(%rdx) a2b2: 62 71 7c 48 29 1a vmovaps %zmm11,(%rdx) a2b8: 62 71 7c 48 29 22 vmovaps %zmm12,(%rdx) a2be: 62 71 7c 48 29 2a vmovaps %zmm13,(%rdx) a2c4: 62 71 7c 48 29 32 vmovaps %zmm14,(%rdx) 
a2ca: 62 71 7c 48 29 12 vmovaps %zmm10,(%rdx) a2d0: 62 71 7c 48 29 1a vmovaps %zmm11,(%rdx) a2d6: 62 71 7c 48 29 22 vmovaps %zmm12,(%rdx) a2dc: 62 71 7c 48 29 2a vmovaps %zmm13,(%rdx) a2e2: 62 71 7c 48 29 32 vmovaps %zmm14,(%rdx) a2e8: 62 71 7c 48 29 12 vmovaps %zmm10,(%rdx) a2ee: 62 71 7c 48 29 1a vmovaps %zmm11,(%rdx) a2f4: 62 71 7c 48 29 22 vmovaps %zmm12,(%rdx) a2fa: 62 71 7c 48 29 2a vmovaps %zmm13,(%rdx) a300: 62 71 7c 48 29 32 vmovaps %zmm14,(%rdx) a306: 62 71 7c 48 29 12 vmovaps %zmm10,(%rdx) a30c: 62 71 7c 48 29 1a vmovaps %zmm11,(%rdx) a312: 62 71 7c 48 29 22 vmovaps %zmm12,(%rdx) a318: 62 71 7c 48 29 2a vmovaps %zmm13,(%rdx) a31e: 62 71 7c 48 29 32 vmovaps %zmm14,(%rdx) a324: 4c 29 cf sub %r9,%rdi a327: 75 83 jne a2ac a329: 41 59 pop %r9 a32b: 41 58 pop %r8 a32d: 59 pop %rcx a32e: 5b pop %rbx a32f: c3 retq 000000000000a330 : a330: 53 push %rbx a331: 51 push %rcx a332: 41 50 push %r8 a334: 41 51 push %r9 a336: 41 52 push %r10 a338: 41 53 push %r11 a33a: 41 54 push %r12 a33c: 41 55 push %r13 a33e: 41 56 push %r14 a340: 41 57 push %r15 a342: 49 c7 c0 01 00 00 00 mov $0x1,%r8 a349: 49 c7 c1 14 00 00 00 mov $0x14,%r9 a350: 48 31 db xor %rbx,%rbx a353: 48 31 c9 xor %rcx,%rcx a356: 4d 31 d2 xor %r10,%r10 a359: 4d 31 db xor %r11,%r11 a35c: 4d 31 e4 xor %r12,%r12 a35f: 4d 31 ed xor %r13,%r13 a362: 4d 31 f6 xor %r14,%r14 a365: 4d 31 ff xor %r15,%r15 000000000000a368 : a368: c4 42 83 f5 f8 pdep %r8,%r15,%r15 a36d: c4 42 8b f5 f0 pdep %r8,%r14,%r14 a372: c4 42 93 f5 e8 pdep %r8,%r13,%r13 a377: c4 42 9b f5 e0 pdep %r8,%r12,%r12 a37c: c4 42 a3 f5 d8 pdep %r8,%r11,%r11 a381: c4 42 ab f5 d0 pdep %r8,%r10,%r10 a386: c4 c2 f3 f5 c8 pdep %r8,%rcx,%rcx a38b: c4 c2 e3 f5 d8 pdep %r8,%rbx,%rbx a390: c4 42 83 f5 f8 pdep %r8,%r15,%r15 a395: c4 42 8b f5 f0 pdep %r8,%r14,%r14 a39a: c4 42 93 f5 e8 pdep %r8,%r13,%r13 a39f: c4 42 9b f5 e0 pdep %r8,%r12,%r12 a3a4: c4 42 a3 f5 d8 pdep %r8,%r11,%r11 a3a9: c4 42 ab f5 d0 pdep %r8,%r10,%r10 a3ae: c4 c2 f3 f5 c8 pdep %r8,%rcx,%rcx a3b3: 
c4 c2 e3 f5 d8 pdep %r8,%rbx,%rbx a3b8: c4 42 83 f5 f8 pdep %r8,%r15,%r15 a3bd: c4 42 8b f5 f0 pdep %r8,%r14,%r14 a3c2: c4 42 93 f5 e8 pdep %r8,%r13,%r13 a3c7: c4 42 9b f5 e0 pdep %r8,%r12,%r12 a3cc: 4c 29 cf sub %r9,%rdi a3cf: 75 97 jne a368 a3d1: 41 5f pop %r15 a3d3: 41 5e pop %r14 a3d5: 41 5d pop %r13 a3d7: 41 5c pop %r12 a3d9: 41 5b pop %r11 a3db: 41 5a pop %r10 a3dd: 41 59 pop %r9 a3df: 41 58 pop %r8 a3e1: 59 pop %rcx a3e2: 5b pop %rbx a3e3: c3 retq 000000000000a3e4 : a3e4: 53 push %rbx a3e5: 51 push %rcx a3e6: 56 push %rsi a3e7: 41 50 push %r8 a3e9: 41 51 push %r9 a3eb: 41 52 push %r10 a3ed: 41 53 push %r11 a3ef: 41 54 push %r12 a3f1: 41 55 push %r13 a3f3: 41 56 push %r14 a3f5: 41 57 push %r15 a3f7: 49 c7 c0 01 00 00 00 mov $0x1,%r8 a3fe: 49 c7 c1 14 00 00 00 mov $0x14,%r9 a405: 48 31 db xor %rbx,%rbx a408: 48 31 c9 xor %rcx,%rcx a40b: 48 31 f6 xor %rsi,%rsi a40e: 4d 31 d2 xor %r10,%r10 a411: 4d 31 db xor %r11,%r11 a414: 4d 31 e4 xor %r12,%r12 a417: 4d 31 ed xor %r13,%r13 a41a: 4d 31 f6 xor %r14,%r14 a41d: 4d 31 ff xor %r15,%r15 000000000000a420 : a420: c4 42 83 f5 f8 pdep %r8,%r15,%r15 a425: 4d 0f af f1 imul %r9,%r14 a429: c4 42 93 f5 e8 pdep %r8,%r13,%r13 a42e: 4d 0f af e1 imul %r9,%r12 a432: c4 42 a3 f5 d8 pdep %r8,%r11,%r11 a437: 4d 0f af d1 imul %r9,%r10 a43b: c4 c2 f3 f5 c8 pdep %r8,%rcx,%rcx a440: 49 0f af d9 imul %r9,%rbx a444: c4 42 83 f5 f8 pdep %r8,%r15,%r15 a449: 49 0f af f1 imul %r9,%rsi a44d: c4 42 83 f5 f8 pdep %r8,%r15,%r15 a452: 4d 0f af f1 imul %r9,%r14 a456: c4 42 93 f5 e8 pdep %r8,%r13,%r13 a45b: 4d 0f af e1 imul %r9,%r12 a45f: c4 42 a3 f5 d8 pdep %r8,%r11,%r11 a464: 4d 0f af d1 imul %r9,%r10 a468: c4 c2 f3 f5 c8 pdep %r8,%rcx,%rcx a46d: 49 0f af d9 imul %r9,%rbx a471: c4 42 83 f5 f8 pdep %r8,%r15,%r15 a476: 49 0f af f1 imul %r9,%rsi a47a: 4c 29 cf sub %r9,%rdi a47d: 75 a1 jne a420 a47f: 41 5f pop %r15 a481: 41 5e pop %r14 a483: 41 5d pop %r13 a485: 41 5c pop %r12 a487: 41 5b pop %r11 a489: 41 5a pop %r10 a48b: 41 59 pop %r9 a48d: 41 58 
pop %r8 a48f: 5e pop %rsi a490: 59 pop %rcx a491: 5b pop %rbx a492: c3 retq 000000000000a493 : a493: 53 push %rbx a494: 51 push %rcx a495: 41 50 push %r8 a497: 41 51 push %r9 a499: 41 52 push %r10 a49b: 41 53 push %r11 a49d: 41 54 push %r12 a49f: 41 55 push %r13 a4a1: 41 56 push %r14 a4a3: 41 57 push %r15 a4a5: 49 c7 c0 01 00 00 00 mov $0x1,%r8 a4ac: 49 c7 c1 14 00 00 00 mov $0x14,%r9 a4b3: 48 31 db xor %rbx,%rbx a4b6: 48 31 c9 xor %rcx,%rcx a4b9: 4d 31 d2 xor %r10,%r10 a4bc: 4d 31 db xor %r11,%r11 a4bf: 4d 31 e4 xor %r12,%r12 a4c2: 4d 31 ed xor %r13,%r13 a4c5: 4d 31 f6 xor %r14,%r14 a4c8: 4d 31 ff xor %r15,%r15 000000000000a4cb : a4cb: c4 42 82 f5 f8 pext %r8,%r15,%r15 a4d0: c4 42 8a f5 f0 pext %r8,%r14,%r14 a4d5: c4 42 92 f5 e8 pext %r8,%r13,%r13 a4da: c4 42 9a f5 e0 pext %r8,%r12,%r12 a4df: c4 42 a2 f5 d8 pext %r8,%r11,%r11 a4e4: c4 42 aa f5 d0 pext %r8,%r10,%r10 a4e9: c4 c2 f2 f5 c8 pext %r8,%rcx,%rcx a4ee: c4 c2 e2 f5 d8 pext %r8,%rbx,%rbx a4f3: c4 42 82 f5 f8 pext %r8,%r15,%r15 a4f8: c4 42 8a f5 f0 pext %r8,%r14,%r14 a4fd: c4 42 92 f5 e8 pext %r8,%r13,%r13 a502: c4 42 9a f5 e0 pext %r8,%r12,%r12 a507: c4 42 a2 f5 d8 pext %r8,%r11,%r11 a50c: c4 42 aa f5 d0 pext %r8,%r10,%r10 a511: c4 c2 f2 f5 c8 pext %r8,%rcx,%rcx a516: c4 c2 e2 f5 d8 pext %r8,%rbx,%rbx a51b: c4 42 82 f5 f8 pext %r8,%r15,%r15 a520: c4 42 8a f5 f0 pext %r8,%r14,%r14 a525: c4 42 92 f5 e8 pext %r8,%r13,%r13 a52a: c4 42 9a f5 e0 pext %r8,%r12,%r12 a52f: 4c 29 cf sub %r9,%rdi a532: 75 97 jne a4cb a534: 41 5f pop %r15 a536: 41 5e pop %r14 a538: 41 5d pop %r13 a53a: 41 5c pop %r12 a53c: 41 5b pop %r11 a53e: 41 5a pop %r10 a540: 41 59 pop %r9 a542: 41 58 pop %r8 a544: 59 pop %rcx a545: 5b pop %rbx a546: c3 retq 000000000000a547 : a547: 53 push %rbx a548: 41 50 push %r8 a54a: 41 51 push %r9 a54c: 41 57 push %r15 a54e: 41 56 push %r14 a550: 41 55 push %r13 a552: 41 54 push %r12 a554: 41 53 push %r11 a556: 41 52 push %r10 a558: 49 c7 c0 01 00 00 00 mov $0x1,%r8 a55f: 49 c7 c1 14 00 00 00 mov $0x14,%r9 
a566: 48 31 db xor %rbx,%rbx 000000000000a569 : a569: 4d 89 fc mov %r15,%r12 a56c: 4d 89 e6 mov %r12,%r14 a56f: 4d 89 f5 mov %r14,%r13 a572: 4d 89 eb mov %r13,%r11 a575: 4d 89 df mov %r11,%r15 a578: 4d 89 fc mov %r15,%r12 a57b: 4d 89 e6 mov %r12,%r14 a57e: 4d 89 f5 mov %r14,%r13 a581: 4d 89 eb mov %r13,%r11 a584: 4d 89 df mov %r11,%r15 a587: 4d 89 fc mov %r15,%r12 a58a: 4d 89 e6 mov %r12,%r14 a58d: 4d 89 f5 mov %r14,%r13 a590: 4d 89 eb mov %r13,%r11 a593: 4d 89 df mov %r11,%r15 a596: 4d 89 fc mov %r15,%r12 a599: 4d 89 e6 mov %r12,%r14 a59c: 4d 89 f5 mov %r14,%r13 a59f: 4d 89 eb mov %r13,%r11 a5a2: 4d 89 df mov %r11,%r15 a5a5: 4c 29 cf sub %r9,%rdi a5a8: 75 bf jne a569 a5aa: 41 5a pop %r10 a5ac: 41 5b pop %r11 a5ae: 41 5c pop %r12 a5b0: 41 5d pop %r13 a5b2: 41 5e pop %r14 a5b4: 41 5f pop %r15 a5b6: 41 59 pop %r9 a5b8: 41 58 pop %r8 a5ba: 5b pop %rbx a5bb: c3 retq 000000000000a5bc : a5bc: 53 push %rbx a5bd: 51 push %rcx a5be: 41 50 push %r8 a5c0: 41 51 push %r9 a5c2: 41 57 push %r15 a5c4: 41 56 push %r14 a5c6: 41 55 push %r13 a5c8: 41 54 push %r12 a5ca: 41 53 push %r11 a5cc: 41 52 push %r10 a5ce: 49 c7 c0 01 00 00 00 mov $0x1,%r8 a5d5: 49 c7 c1 14 00 00 00 mov $0x14,%r9 a5dc: 48 31 db xor %rbx,%rbx 000000000000a5df : a5df: 4d 89 d7 mov %r10,%r15 a5e2: 4d 89 de mov %r11,%r14 a5e5: 4d 89 e5 mov %r12,%r13 a5e8: 49 89 c7 mov %rax,%r15 a5eb: 49 89 ce mov %rcx,%r14 a5ee: 4d 89 d7 mov %r10,%r15 a5f1: 4d 89 de mov %r11,%r14 a5f4: 4d 89 e5 mov %r12,%r13 a5f7: 49 89 c7 mov %rax,%r15 a5fa: 49 89 ce mov %rcx,%r14 a5fd: 4d 89 d7 mov %r10,%r15 a600: 4d 89 de mov %r11,%r14 a603: 4d 89 e5 mov %r12,%r13 a606: 49 89 c7 mov %rax,%r15 a609: 49 89 ce mov %rcx,%r14 a60c: 4d 89 d7 mov %r10,%r15 a60f: 4d 89 de mov %r11,%r14 a612: 4d 89 e5 mov %r12,%r13 a615: 49 89 c7 mov %rax,%r15 a618: 49 89 ce mov %rcx,%r14 a61b: 4c 29 cf sub %r9,%rdi a61e: 75 bf jne a5df a620: 41 5a pop %r10 a622: 41 5b pop %r11 a624: 41 5c pop %r12 a626: 41 5d pop %r13 a628: 41 5e pop %r14 a62a: 41 5f pop %r15 a62c: 41 
59 pop %r9 a62e: 41 58 pop %r8 a630: 59 pop %rcx a631: 5b pop %rbx a632: c3 retq 000000000000a633 : a633: 53 push %rbx a634: 51 push %rcx a635: 41 50 push %r8 a637: 41 51 push %r9 a639: 41 57 push %r15 a63b: 41 56 push %r14 a63d: 41 55 push %r13 a63f: 41 54 push %r12 a641: 41 53 push %r11 a643: 41 52 push %r10 a645: 49 c7 c0 01 00 00 00 mov $0x1,%r8 a64c: 49 c7 c1 14 00 00 00 mov $0x14,%r9 a653: 48 31 db xor %rbx,%rbx 000000000000a656 : a656: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a65d: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a664: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a66b: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a672: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a679: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a680: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a687: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a68e: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a695: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a69c: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a6a3: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a6aa: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a6b1: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a6b8: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a6bf: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a6c6: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a6cd: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a6d4: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a6db: 49 c7 c7 00 00 00 00 mov $0x0,%r15 a6e2: 4c 29 cf sub %r9,%rdi a6e5: 0f 85 6b ff ff ff jne a656 a6eb: 41 5a pop %r10 a6ed: 41 5b pop %r11 a6ef: 41 5c pop %r12 a6f1: 41 5d pop %r13 a6f3: 41 5e pop %r14 a6f5: 41 5f pop %r15 a6f7: 41 59 pop %r9 a6f9: 41 58 pop %r8 a6fb: 59 pop %rcx a6fc: 5b pop %rbx a6fd: c3 retq 000000000000a6fe : a6fe: 53 push %rbx a6ff: 51 push %rcx a700: 41 50 push %r8 a702: 41 51 push %r9 a704: 41 57 push %r15 a706: 41 56 push %r14 a708: 41 55 push %r13 a70a: 41 54 push %r12 a70c: 41 53 push %r11 a70e: 41 52 push %r10 a710: 49 c7 c0 01 00 00 00 mov $0x1,%r8 a717: 49 c7 c1 14 00 00 00 mov $0x14,%r9 a71e: 48 31 db xor %rbx,%rbx 000000000000a721 : a721: 4d 31 ff xor %r15,%r15 a724: 4d 31 ff xor %r15,%r15 a727: 4d 31 ff xor %r15,%r15 a72a: 4d 31 ff 
xor %r15,%r15 a72d: 4d 31 ff xor %r15,%r15 a730: 4d 31 ff xor %r15,%r15 a733: 4d 31 ff xor %r15,%r15 a736: 4d 31 ff xor %r15,%r15 a739: 4d 31 ff xor %r15,%r15 a73c: 4d 31 ff xor %r15,%r15 a73f: 4d 31 ff xor %r15,%r15 a742: 4d 31 ff xor %r15,%r15 a745: 4d 31 ff xor %r15,%r15 a748: 4d 31 ff xor %r15,%r15 a74b: 4d 31 ff xor %r15,%r15 a74e: 4d 31 ff xor %r15,%r15 a751: 4d 31 ff xor %r15,%r15 a754: 4d 31 ff xor %r15,%r15 a757: 4d 31 ff xor %r15,%r15 a75a: 4d 31 ff xor %r15,%r15 a75d: 4c 29 cf sub %r9,%rdi a760: 75 bf jne a721 a762: 41 5a pop %r10 a764: 41 5b pop %r11 a766: 41 5c pop %r12 a768: 41 5d pop %r13 a76a: 41 5e pop %r14 a76c: 41 5f pop %r15 a76e: 41 59 pop %r9 a770: 41 58 pop %r8 a772: 59 pop %rcx a773: 5b pop %rbx a774: c3 retq 000000000000a775 : a775: 53 push %rbx a776: 51 push %rcx a777: 41 50 push %r8 a779: 41 51 push %r9 a77b: 41 57 push %r15 a77d: 41 56 push %r14 a77f: 41 55 push %r13 a781: 41 54 push %r12 a783: 41 53 push %r11 a785: 41 52 push %r10 a787: 49 c7 c0 01 00 00 00 mov $0x1,%r8 a78e: 49 c7 c1 14 00 00 00 mov $0x14,%r9 a795: 48 31 db xor %rbx,%rbx 000000000000a798 : a798: 4d 29 ff sub %r15,%r15 a79b: 4d 29 ff sub %r15,%r15 a79e: 4d 29 ff sub %r15,%r15 a7a1: 4d 29 ff sub %r15,%r15 a7a4: 4d 29 ff sub %r15,%r15 a7a7: 4d 29 ff sub %r15,%r15 a7aa: 4d 29 ff sub %r15,%r15 a7ad: 4d 29 ff sub %r15,%r15 a7b0: 4d 29 ff sub %r15,%r15 a7b3: 4d 29 ff sub %r15,%r15 a7b6: 4d 29 ff sub %r15,%r15 a7b9: 4d 29 ff sub %r15,%r15 a7bc: 4d 29 ff sub %r15,%r15 a7bf: 4d 29 ff sub %r15,%r15 a7c2: 4d 29 ff sub %r15,%r15 a7c5: 4d 29 ff sub %r15,%r15 a7c8: 4d 29 ff sub %r15,%r15 a7cb: 4d 29 ff sub %r15,%r15 a7ce: 4d 29 ff sub %r15,%r15 a7d1: 4d 29 ff sub %r15,%r15 a7d4: 4c 29 cf sub %r9,%rdi a7d7: 75 bf jne a798 a7d9: 41 5a pop %r10 a7db: 41 5b pop %r11 a7dd: 41 5c pop %r12 a7df: 41 5d pop %r13 a7e1: 41 5e pop %r14 a7e3: 41 5f pop %r15 a7e5: 41 59 pop %r9 a7e7: 41 58 pop %r8 a7e9: 59 pop %rcx a7ea: 5b pop %rbx a7eb: c3 retq 000000000000a7ec : a7ec: 53 push %rbx a7ed: 51 push 
%rcx a7ee: 41 50 push %r8 a7f0: 41 51 push %r9 a7f2: 41 57 push %r15 a7f4: 41 56 push %r14 a7f6: 41 55 push %r13 a7f8: 41 54 push %r12 a7fa: 41 53 push %r11 a7fc: 41 52 push %r10 a7fe: 49 c7 c0 01 00 00 00 mov $0x1,%r8 a805: 49 c7 c1 14 00 00 00 mov $0x14,%r9 a80c: 48 31 db xor %rbx,%rbx 000000000000a80f : a80f: 49 83 c7 01 add $0x1,%r15 a813: 49 83 c7 02 add $0x2,%r15 a817: 49 83 c7 03 add $0x3,%r15 a81b: 49 83 c7 04 add $0x4,%r15 a81f: 49 83 c7 05 add $0x5,%r15 a823: 49 83 c7 06 add $0x6,%r15 a827: 49 83 c7 07 add $0x7,%r15 a82b: 49 83 c7 08 add $0x8,%r15 a82f: 49 83 c7 09 add $0x9,%r15 a833: 49 83 c7 0a add $0xa,%r15 a837: 49 83 c7 0b add $0xb,%r15 a83b: 49 83 c7 0c add $0xc,%r15 a83f: 49 83 c7 0d add $0xd,%r15 a843: 49 83 c7 0e add $0xe,%r15 a847: 49 83 c7 0f add $0xf,%r15 a84b: 49 83 c7 10 add $0x10,%r15 a84f: 49 83 c7 11 add $0x11,%r15 a853: 49 83 c7 12 add $0x12,%r15 a857: 49 83 c7 13 add $0x13,%r15 a85b: 49 83 c7 14 add $0x14,%r15 a85f: 4c 29 cf sub %r9,%rdi a862: 75 ab jne a80f a864: 41 5a pop %r10 a866: 41 5b pop %r11 a868: 41 5c pop %r12 a86a: 41 5d pop %r13 a86c: 41 5e pop %r14 a86e: 41 5f pop %r15 a870: 41 59 pop %r9 a872: 41 58 pop %r8 a874: 59 pop %rcx a875: 5b pop %rbx a876: c3 retq 000000000000a877 : a877: 53 push %rbx a878: 51 push %rcx a879: 41 50 push %r8 a87b: 41 51 push %r9 a87d: 41 57 push %r15 a87f: 41 56 push %r14 a881: 41 55 push %r13 a883: 41 54 push %r12 a885: 41 53 push %r11 a887: 41 52 push %r10 a889: 49 c7 c0 01 00 00 00 mov $0x1,%r8 a890: 49 c7 c1 14 00 00 00 mov $0x14,%r9 a897: 48 31 db xor %rbx,%rbx 000000000000a89a : a89a: 49 ff c7 inc %r15 a89d: 49 ff c7 inc %r15 a8a0: 49 ff c7 inc %r15 a8a3: 49 ff c7 inc %r15 a8a6: 49 ff c7 inc %r15 a8a9: 49 ff c7 inc %r15 a8ac: 49 ff c7 inc %r15 a8af: 49 ff c7 inc %r15 a8b2: 49 ff c7 inc %r15 a8b5: 49 ff c7 inc %r15 a8b8: 49 ff c7 inc %r15 a8bb: 49 ff c7 inc %r15 a8be: 49 ff c7 inc %r15 a8c1: 49 ff c7 inc %r15 a8c4: 49 ff c7 inc %r15 a8c7: 49 ff c7 inc %r15 a8ca: 49 ff c7 inc %r15 a8cd: 49 ff 
c7 inc %r15 a8d0: 49 ff c7 inc %r15 a8d3: 49 ff c7 inc %r15 a8d6: 4c 29 cf sub %r9,%rdi a8d9: 75 bf jne a89a a8db: 41 5a pop %r10 a8dd: 41 5b pop %r11 a8df: 41 5c pop %r12 a8e1: 41 5d pop %r13 a8e3: 41 5e pop %r14 a8e5: 41 5f pop %r15 a8e7: 41 59 pop %r9 a8e9: 41 58 pop %r8 a8eb: 59 pop %rcx a8ec: 5b pop %rbx a8ed: c3 retq 000000000000a8ee : a8ee: 53 push %rbx a8ef: 51 push %rcx a8f0: 41 50 push %r8 a8f2: 41 51 push %r9 a8f4: 41 57 push %r15 a8f6: 41 56 push %r14 a8f8: 41 55 push %r13 a8fa: 41 54 push %r12 a8fc: 41 53 push %r11 a8fe: 41 52 push %r10 a900: 49 c7 c0 01 00 00 00 mov $0x1,%r8 a907: 49 c7 c1 14 00 00 00 mov $0x14,%r9 a90e: 4d 31 ff xor %r15,%r15 a911: 49 f7 d7 not %r15 a914: 48 31 db xor %rbx,%rbx 000000000000a917 : a917: 49 ff cf dec %r15 a91a: 49 ff cf dec %r15 a91d: 49 ff cf dec %r15 a920: 49 ff cf dec %r15 a923: 49 ff cf dec %r15 a926: 49 ff cf dec %r15 a929: 49 ff cf dec %r15 a92c: 49 ff cf dec %r15 a92f: 49 ff cf dec %r15 a932: 49 ff cf dec %r15 a935: 49 ff cf dec %r15 a938: 49 ff cf dec %r15 a93b: 49 ff cf dec %r15 a93e: 49 ff cf dec %r15 a941: 49 ff cf dec %r15 a944: 49 ff cf dec %r15 a947: 49 ff cf dec %r15 a94a: 49 ff cf dec %r15 a94d: 49 ff cf dec %r15 a950: 49 ff cf dec %r15 a953: 4c 29 cf sub %r9,%rdi a956: 75 bf jne a917 a958: 41 5a pop %r10 a95a: 41 5b pop %r11 a95c: 41 5c pop %r12 a95e: 41 5d pop %r13 a960: 41 5e pop %r14 a962: 41 5f pop %r15 a964: 41 59 pop %r9 a966: 41 58 pop %r8 a968: 59 pop %rcx a969: 5b pop %rbx a96a: c3 retq a96b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) 000000000000a970 : a970: f3 0f 1e fa endbr64 a974: 48 8b 35 85 37 00 00 mov 0x3785(%rip),%rsi # e100 a97b: e9 73 f6 ff ff jmpq 9ff3 000000000000a980 : a980: f3 0f 1e fa endbr64 a984: 48 8b 35 75 37 00 00 mov 0x3775(%rip),%rsi # e100 a98b: e9 99 f5 ff ff jmpq 9f29 000000000000a990 : a990: f3 0f 1e fa endbr64 a994: 48 8b 35 65 37 00 00 mov 0x3765(%rip),%rsi # e100 a99b: e9 e7 f4 ff ff jmpq 9e87 000000000000a9a0 : a9a0: f3 0f 1e fa endbr64 a9a4: 48 8d 35 15 37 00 00 lea 
0x3715(%rip),%rsi # e0c0 a9ab: e9 c0 f6 ff ff jmpq a070 000000000000a9b0 : a9b0: f3 0f 1e fa endbr64 a9b4: 48 8d 35 05 37 00 00 lea 0x3705(%rip),%rsi # e0c0 a9bb: e9 19 f7 ff ff jmpq a0d9 000000000000a9c0 : a9c0: f3 0f 1e fa endbr64 a9c4: 48 8b 35 35 37 00 00 mov 0x3735(%rip),%rsi # e100 a9cb: 48 8d 15 6e 36 00 00 lea 0x366e(%rip),%rdx # e040 a9d2: e9 93 f7 ff ff jmpq a16a a9d7: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1) a9de: 00 00 000000000000a9e0 : a9e0: f3 0f 1e fa endbr64 a9e4: 48 8d 15 95 36 00 00 lea 0x3695(%rip),%rdx # e080 a9eb: 48 8d 35 ce 36 00 00 lea 0x36ce(%rip),%rsi # e0c0 a9f2: e9 09 f8 ff ff jmpq a200 a9f7: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1) a9fe: 00 00 000000000000aa00 : aa00: f3 0f 1e fa endbr64 aa04: 48 8d 15 75 36 00 00 lea 0x3675(%rip),%rdx # e080 aa0b: 48 8d 35 ae 36 00 00 lea 0x36ae(%rip),%rsi # e0c0 aa12: e9 6a f8 ff ff jmpq a281 aa17: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1) aa1e: 00 00 000000000000aa20 : aa20: f3 0f 1e fa endbr64 aa24: 48 8d 35 95 36 00 00 lea 0x3695(%rip),%rsi # e0c0 aa2b: e9 03 e9 ff ff jmpq 9333 000000000000aa30 : aa30: f3 0f 1e fa endbr64 aa34: 48 8d 35 85 36 00 00 lea 0x3685(%rip),%rsi # e0c0 aa3b: e9 be e9 ff ff jmpq 93fe 000000000000aa40 : aa40: f3 0f 1e fa endbr64 aa44: 55 push %rbp aa45: 48 89 f5 mov %rsi,%rbp aa48: 53 push %rbx aa49: 48 89 fb mov %rdi,%rbx aa4c: 48 83 ec 58 sub $0x58,%rsp aa50: f3 0f 11 44 24 0c movss %xmm0,0xc(%rsp) aa56: 48 8d 74 24 10 lea 0x10(%rsp),%rsi aa5b: 48 8d 7c 24 20 lea 0x20(%rsp),%rdi aa60: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax aa67: 00 00 aa69: 48 89 44 24 48 mov %rax,0x48(%rsp) aa6e: 31 c0 xor %eax,%eax aa70: e8 db 65 ff ff callq 1050 aa75: 48 89 df mov %rbx,%rdi aa78: ff d5 callq *%rbp aa7a: 48 8d 74 24 18 lea 0x18(%rsp),%rsi aa7f: 48 8d 7c 24 30 lea 0x30(%rsp),%rdi aa84: e8 c7 65 ff ff callq 1050 aa89: 48 8b 74 24 38 mov 0x38(%rsp),%rsi aa8e: 48 2b 74 24 28 sub 0x28(%rsp),%rsi aa93: 48 ba cf f7 53 e3 a5 movabs $0x20c49ba5e353f7cf,%rdx aa9a: 9b c4 20 aa9d: 48 89 f0 mov 
%rsi,%rax aaa0: 48 8b 4c 24 30 mov 0x30(%rsp),%rcx aaa5: 48 2b 4c 24 20 sub 0x20(%rsp),%rcx aaaa: 48 c1 fe 3f sar $0x3f,%rsi aaae: 48 f7 ea imul %rdx aab1: 48 69 c9 e8 03 00 00 imul $0x3e8,%rcx,%rcx aab8: 48 c1 fa 07 sar $0x7,%rdx aabc: 48 29 f2 sub %rsi,%rdx aabf: 48 01 d1 add %rdx,%rcx aac2: 78 5c js ab20 aac4: 66 0f ef c9 pxor %xmm1,%xmm1 aac8: f3 48 0f 2a c9 cvtsi2ss %rcx,%xmm1 aacd: f3 0f 5a c9 cvtss2sd %xmm1,%xmm1 aad1: f2 0f 59 0d 9f 15 00 mulsd 0x159f(%rip),%xmm1 # c078 <_IO_stdin_used+0x1078> aad8: 00 aad9: 48 85 db test %rbx,%rbx aadc: 78 62 js ab40 aade: 66 0f ef c0 pxor %xmm0,%xmm0 aae2: f3 48 0f 2a c3 cvtsi2ss %rbx,%xmm0 aae7: f3 0f 5a c0 cvtss2sd %xmm0,%xmm0 aaeb: 48 8b 44 24 48 mov 0x48(%rsp),%rax aaf0: 64 48 33 04 25 28 00 xor %fs:0x28,%rax aaf7: 00 00 aaf9: f2 0f 5e c8 divsd %xmm0,%xmm1 aafd: f3 0f 10 05 83 15 00 movss 0x1583(%rip),%xmm0 # c088 <_IO_stdin_used+0x1088> ab04: 00 ab05: f2 0f 5a c9 cvtsd2ss %xmm1,%xmm1 ab09: f3 0f 5e c1 divss %xmm1,%xmm0 ab0d: f3 0f 5e 44 24 0c divss 0xc(%rsp),%xmm0 ab13: 75 46 jne ab5b ab15: 48 83 c4 58 add $0x58,%rsp ab19: 5b pop %rbx ab1a: 5d pop %rbp ab1b: c3 retq ab1c: 0f 1f 40 00 nopl 0x0(%rax) ab20: 48 89 c8 mov %rcx,%rax ab23: 83 e1 01 and $0x1,%ecx ab26: 66 0f ef c9 pxor %xmm1,%xmm1 ab2a: 48 d1 e8 shr %rax ab2d: 48 09 c8 or %rcx,%rax ab30: f3 48 0f 2a c8 cvtsi2ss %rax,%xmm1 ab35: f3 0f 58 c9 addss %xmm1,%xmm1 ab39: eb 92 jmp aacd ab3b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) ab40: 48 89 d8 mov %rbx,%rax ab43: 83 e3 01 and $0x1,%ebx ab46: 66 0f ef c0 pxor %xmm0,%xmm0 ab4a: 48 d1 e8 shr %rax ab4d: 48 09 d8 or %rbx,%rax ab50: f3 48 0f 2a c0 cvtsi2ss %rax,%xmm0 ab55: f3 0f 58 c0 addss %xmm0,%xmm0 ab59: eb 8c jmp aae7 ab5b: e8 e0 64 ff ff callq 1040 <__stack_chk_fail@plt> 000000000000ab60 : ab60: f3 0f 1e fa endbr64 ab64: 48 8b 35 95 35 00 00 mov 0x3595(%rip),%rsi # e100 ab6b: e9 b9 f3 ff ff jmpq 9f29 000000000000ab70 <__libc_csu_init>: ab70: f3 0f 1e fa endbr64 ab74: 41 57 push %r15 ab76: 4c 8d 3d fb 31 00 00 lea 
0x31fb(%rip),%r15 # dd78 <__init_array_start> ab7d: 41 56 push %r14 ab7f: 49 89 d6 mov %rdx,%r14 ab82: 41 55 push %r13 ab84: 49 89 f5 mov %rsi,%r13 ab87: 41 54 push %r12 ab89: 41 89 fc mov %edi,%r12d ab8c: 55 push %rbp ab8d: 48 8d 2d f4 31 00 00 lea 0x31f4(%rip),%rbp # dd88 <__do_global_dtors_aux_fini_array_entry> ab94: 53 push %rbx ab95: 4c 29 fd sub %r15,%rbp ab98: 48 83 ec 08 sub $0x8,%rsp ab9c: e8 5f 64 ff ff callq 1000 <_init> aba1: 48 c1 fd 03 sar $0x3,%rbp aba5: 74 1f je abc6 <__libc_csu_init+0x56> aba7: 31 db xor %ebx,%ebx aba9: 0f 1f 80 00 00 00 00 nopl 0x0(%rax) abb0: 4c 89 f2 mov %r14,%rdx abb3: 4c 89 ee mov %r13,%rsi abb6: 44 89 e7 mov %r12d,%edi abb9: 41 ff 14 df callq *(%r15,%rbx,8) abbd: 48 83 c3 01 add $0x1,%rbx abc1: 48 39 dd cmp %rbx,%rbp abc4: 75 ea jne abb0 <__libc_csu_init+0x40> abc6: 48 83 c4 08 add $0x8,%rsp abca: 5b pop %rbx abcb: 5d pop %rbp abcc: 41 5c pop %r12 abce: 41 5d pop %r13 abd0: 41 5e pop %r14 abd2: 41 5f pop %r15 abd4: c3 retq abd5: 66 66 2e 0f 1f 84 00 data16 nopw %cs:0x0(%rax,%rax,1) abdc: 00 00 00 00 000000000000abe0 <__libc_csu_fini>: abe0: f3 0f 1e fa endbr64 abe4: c3 retq Disassembly of section .fini: 000000000000abe8 <_fini>: abe8: f3 0f 1e fa endbr64 abec: 48 83 ec 08 sub $0x8,%rsp abf0: 48 83 c4 08 add $0x8,%rsp abf4: c3 retq ================================================ FILE: InstructionRate/x86_fusion.c ================================================ /* This is a one-off microbenchmark for attempts to figure out what * instructions are fused on Centaur's CNS */ #include #include #include #include #include #include #include // make mingw happy for cross compiling #ifdef __MINGW32__ #define aligned_alloc(align, size) _aligned_malloc(size, align) #endif extern uint64_t noptest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t clktest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t addtest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t testfusion(uint64_t iterations) 
__attribute((sysv_abi)); extern uint64_t cmpfusion(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t subfusion(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t nopfusion(uint64_t iterations) __attribute((sysv_abi)); float fpTestArr[8] __attribute__ ((aligned (64))) = { 0.2, 1.5, 2.7, 3.14, 5.16, 6.3, 7.7, 9.45 }; float fpSinkArr[8] __attribute__ ((aligned (64))) = { 2.1, 3.2, 4.3, 5.4, 6.2, 7.8, 8.3, 9.4 }; int *intTestArr; int intSinkArr[8] __attribute__ ((aligned (64))) = { 2, 3, 4, 5, 6, 7, 8, 9 }; float measureFunction(uint64_t iterations, float clockSpeedGhz, __attribute((sysv_abi)) uint64_t (*testfunc)(uint64_t)); int main(int argc, char *argv[]) { struct timeval startTv, endTv; struct timezone startTz, endTz; uint64_t iterations = 1500000000; uint64_t iterationsHigh = iterations * 5; uint64_t time_diff_ms; float latency, opsPerNs, clockSpeedGhz; uint64_t intTestArrLength = 1024; intTestArr = aligned_alloc(64, sizeof(int) * intTestArrLength); for (uint64_t i = 0; i < intTestArrLength; i++) { intTestArr[i] = i; } if (argc > 2) { iterationsHigh = 1500000000 * (uint64_t)atol(argv[2]); printf("setting %lu iterations\n", iterationsHigh); } // figure out clock speed gettimeofday(&startTv, &startTz); clktest(iterationsHigh); gettimeofday(&endTv, &endTz); time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); latency = 1e6 * (float)time_diff_ms / (float)iterationsHigh; // clk speed should be 1/latency, assuming we got one add per clk, roughly clockSpeedGhz = 1/latency; printf("Estimated clock speed: %.2f GHz\n", clockSpeedGhz); // throughput printf("2-byte nops per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, noptest)); printf("Adds per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addtest)); printf("test+jnz: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, testfusion)); printf("cmp+jnz: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, cmpfusion)); 
printf("sub+jnz: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, subfusion)); printf("nop+jnz: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, nopfusion)); return 0; } float measureFunction(uint64_t iterations, float clockSpeedGhz, __attribute((sysv_abi)) uint64_t (*testfunc)(uint64_t)) { struct timeval startTv, endTv; struct timezone startTz, endTz; uint64_t time_diff_ms, retval; float latency, opsPerNs; gettimeofday(&startTv, &startTz); retval = testfunc(iterations); gettimeofday(&endTv, &endTz); time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); latency = 1e6 * (float)time_diff_ms / (float)iterations; opsPerNs = 1/latency; //printf("%f adds/ns, %f adds/clk?\n", opsPerNs, opsPerNs / clockSpeedGhz); //printf("return value: %lu\n", retval); return opsPerNs / clockSpeedGhz; } ================================================ FILE: InstructionRate/x86_fusion.s ================================================ .text .global clktest .global addtest .global noptest .global testfusion .global cmpfusion .global subfusion .global nopfusion testfusion: push %rbx push %r8 push %r9 push %r10 xor %rax, %rax not %rax testfusion_loop: xor %r8, %r8 xor %r9, %r9 sub $5, %rdi test %rdi, %rax jnz testfusion_loop pop %r10 pop %r9 pop %r8 pop %rbx ret cmpfusion: push %rbx push %r8 push %r9 push %r10 xor %rax, %rax cmpfusion_loop: xor %r8, %r8 xor %r9, %r9 sub $5, %rdi cmp %rdi, %rax jnz cmpfusion_loop pop %r10 pop %r9 pop %r8 pop %rbx ret subfusion: push %rbx push %r8 push %r9 push %r10 xor %rax, %rax subfusion_loop: xor %r8, %r8 xor %r9, %r9 xor %r10, %r10 sub $5, %rdi jnz subfusion_loop pop %r10 pop %r9 pop %r8 pop %rbx ret nopfusion: push %rbx push %r8 push %r9 push %r10 xor %rax, %rax nopfusion_loop: sub $5, %rdi nop nop nop jnz nopfusion_loop pop %r10 pop %r9 pop %r8 pop %rbx ret clktest: push %rbx push %r8 push %r9 mov $1, %r8 mov $20, %r9 xor %rbx, %rbx clktest_loop: add %r8, %rbx add %r8, %rbx add %r8, %rbx add 
%r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx sub %r9, %rdi jnz clktest_loop pop %r9 pop %r8 pop %rbx ret noptest: push %rbx push %r9 mov $20, %r9 noptest_loop: xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax sub %r9, %rdi jnz noptest_loop pop %r9 pop %rbx ret addtest: push %rbx push %rcx push %r8 push %r9 push %r10 push %r11 push %r12 push %r13 push %r14 push %r15 mov $1, %r8 mov $20, %r9 xor %rbx, %rbx xor %rcx, %rcx xor %r10, %r10 xor %r11, %r11 xor %r12, %r12 xor %r13, %r13 xor %r14, %r14 xor %r15, %r15 addtest_loop: add %r8, %r15 add %r8, %r14 add %r8, %r13 add %r8, %r12 add %r8, %r11 add %r8, %r10 add %r8, %rcx add %r8, %r15 add %r8, %r14 add %r8, %r13 add %r8, %r12 add %r8, %r11 add %r8, %r10 add %r8, %rcx add %r8, %r15 add %r8, %r14 add %r8, %r13 add %r8, %r12 add %r8, %r11 add %r8, %r10 sub %r9, %rdi jnz addtest_loop pop %r15 pop %r14 pop %r13 pop %r12 pop %r11 pop %r10 pop %r9 pop %r8 pop %rcx pop %rbx ret ================================================ FILE: InstructionRate/x86_instructionrate.c ================================================ /* This is a one-off microbenchmark for attempts to dissect * Zhaoxin's KX-6640MA (LuJiaZui) architecture */ #include #include #include #include #include #include #include #include #include #include // make mingw happy for cross compiling #ifdef __MINGW32__ #define aligned_alloc(align, size) _aligned_malloc(size, align) #endif extern uint64_t noptest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t noptest1b(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t clktest(uint64_t iterations) __attribute((sysv_abi)); 
extern uint64_t clkmovtest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t addtest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t addnoptest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t addmovtest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t leatest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t leamultest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t rortest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t shltest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t rorbtstest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mixrormultest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mixrorshltest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t btstest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t btsmultest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t addmultest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t addjmptest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t jmpmultest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t jmptest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t ntjmptest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mixadd256int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mixadd256int11(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mixadd256fpint(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mix256fp(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mix256fp11(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latadd512int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latadd256int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latadd128int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latadd256fp(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t 
latmul128int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latmul256int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latmul512int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latmulq512int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latmuldq512int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latmul256fp(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latadd128fp(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latmul128fp(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latfma512(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latfma256(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latfma128(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t add128int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t add256int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t add512int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mul512int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t muldq512int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mul128int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t add128fp(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mul128fp(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t fma512(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mixfma256fma512(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mix21fma256fma512(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t fma256(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t fma128(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mixfmafadd256(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mixfmaadd256(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mixfmaadd512(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t 
mixfma512add256(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mixfmaand256(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mixfmaandmem256(uint64_t iterations, float *arr) __attribute((sysv_abi)); extern uint64_t mixfmaaddmem256(uint64_t iterations, float *arr) __attribute((sysv_abi)); extern uint64_t nemesfpumix21(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t nemesfpu512mix21(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mul256fp(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t add256fp(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latmul64(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t latmul16(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mul16(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mul64(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t load128(uint64_t iterations, int *arr) __attribute((sysv_abi)); extern uint64_t spacedload128(uint64_t iterations, int *arr) __attribute((sysv_abi)); extern uint64_t loadscalar(uint64_t iterations, int *arr) __attribute((sysv_abi)); extern uint64_t mixedscalarloadstore(uint64_t iterations, int *arr) __attribute((sysv_abi)); extern uint64_t load256(uint64_t iterations, float *arr) __attribute((sysv_abi)); extern uint64_t load512(uint64_t iterations, float *arr) __attribute((sysv_abi)); extern uint64_t store128(uint64_t iterations, int *arr, int *sink) __attribute((sysv_abi)); extern uint64_t store256(uint64_t iterations, float *arr, float *sink) __attribute((sysv_abi)); extern uint64_t store512(uint64_t iterations, float *arr, float *sink) __attribute((sysv_abi)); extern uint64_t mixaddmul128int(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mixmul16mul64(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mixmul16mul64_21(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t pdeptest(uint64_t iterations) __attribute((sysv_abi)); extern 
uint64_t pdepmultest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t pexttest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t indepmovtest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t vecindepmovtest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t depmovtest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t vecdepmovtest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t xorzerotest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t vecxorzerotest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t movzerotest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t subzerotest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t vecsubzerotest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t depinctest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t depdectest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t depaddimmtest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t memrenametest(uint64_t iterations, int *arr) __attribute((sysv_abi)); extern uint64_t spacedstorescalar(uint64_t iterations, int *arr) __attribute((sysv_abi)); extern uint64_t aesenc128(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t aesdec128(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t aesencfadd128(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t aesencadd128(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t aesencfma128(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t aesencmul128(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t mix256faddintadd(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t movqtoxmmtest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t fma4_256(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t fma4_128(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t fdivtest(uint64_t 
iterations) __attribute((sysv_abi)); extern uint64_t fdivlattest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t fmuldenormtest(uint64_t iterations) __attribute((sysv_abi)); extern uint64_t fmuldenormlattest(uint64_t iterations) __attribute((sysv_abi)); float fpTestArr[8] __attribute__ ((aligned (64))) = { 0.2, 1.5, 2.7, 3.14, 5.16, 6.3, 7.7, 9.45 }; float fpSinkArr[8] __attribute__ ((aligned (64))) = { 2.1, 3.2, 4.3, 5.4, 6.2, 7.8, 8.3, 9.4 }; int *intTestArr; int intSinkArr[8] __attribute__ ((aligned (64))) = { 2, 3, 4, 5, 6, 7, 8, 9 }; uint64_t load128wrapper(uint64_t iterations) __attribute((sysv_abi)); uint64_t loadscalarwrapper(uint64_t iterations) __attribute((sysv_abi)); uint64_t mixedscalarloadstorewrapper(uint64_t iterations) __attribute((sysv_abi)); uint64_t spacedload128wrapper(uint64_t iterations) __attribute((sysv_abi)); uint64_t spacedstorescalarwrapper(uint64_t iterations) __attribute((sysv_abi)); uint64_t load256wrapper(uint64_t iterations) __attribute((sysv_abi)); uint64_t load512wrapper(uint64_t iterations) __attribute((sysv_abi)); uint64_t store128wrapper(uint64_t iterations) __attribute((sysv_abi)); uint64_t store256wrapper(uint64_t iterations) __attribute((sysv_abi)); uint64_t store512wrapper(uint64_t iterations) __attribute((sysv_abi)); uint64_t mixfmaandmem256wrapper(uint64_t iterations) __attribute((sysv_abi)); uint64_t mixfmaaddmem256wrapper(uint64_t iterations) __attribute((sysv_abi)); uint64_t memrenamewrapper(uint64_t iterations) __attribute((sysv_abi)); float measureFunction(uint64_t iterations, float clockSpeedGhz, __attribute((sysv_abi)) uint64_t (*testfunc)(uint64_t)); int threads = 0; int main(int argc, char *argv[]) { struct timeval startTv, endTv; struct timezone startTz, endTz; uint64_t iterations = 1500000000; uint64_t iterationsHigh = iterations * 5; uint64_t time_diff_ms; float latency, opsPerNs, clockSpeedGhz; uint64_t intTestArrLength = 1024; int avxSupported = 0, avx2Supported = 0, bmi2Supported = 0, 
avx512Supported = 0; int fmaSupported = 0, fma4Supported = 0; char *testName = NULL; if (argc > 1) { for (int argIdx = 1; argIdx < argc; argIdx++) { if (*(argv[argIdx]) == '-') { char *arg = argv[argIdx] + 1; if (strncmp(arg, "threads", 7) == 0) { argIdx++; threads = atoi(argv[argIdx]); fprintf(stderr, "Multithreading mode, %d threads\n", threads); } else if (strncmp(arg, "iter", 4) == 0) { argIdx++; int iterMul = atoi(argv[argIdx]); iterations *= iterMul; iterationsHigh *= iterMul; fprintf(stderr, "Scaled iterations by %d\n", iterMul); } else if (strncmp(arg, "test", 4) == 0) { argIdx++; testName = argv[argIdx]; fprintf(stderr, "Only running test %s\n", testName); } } } } intTestArr = aligned_alloc(64, sizeof(int) * intTestArrLength); for (uint64_t i = 0; i < intTestArrLength; i++) { intTestArr[i] = i; } if (__builtin_cpu_supports("avx")) { fprintf(stderr, "avx supported\n"); avxSupported = 1; } if (__builtin_cpu_supports("avx2")) { fprintf(stderr, "avx2 supported\n"); avx2Supported = 1; } if (__builtin_cpu_supports("bmi2")) { fprintf(stderr, "bmi2 supported\n"); bmi2Supported = 1; } if (__builtin_cpu_supports("fma")) { fprintf(stderr, "fma3 supported\n"); fmaSupported = 1; } if (__builtin_cpu_supports("fma4")) { fprintf(stderr, "fma4 supported\n"); fma4Supported = 1; } uint32_t cpuidEax, cpuidEbx, cpuidEcx, cpuidEdx; __cpuid_count(7, 0, cpuidEax, cpuidEbx, cpuidEcx, cpuidEdx); if (cpuidEbx & (1UL << 16)) { fprintf(stderr, "AVX512 supported\n"); avx512Supported = 1; } // figure out clock speed gettimeofday(&startTv, &startTz); clktest(iterationsHigh); gettimeofday(&endTv, &endTz); time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); latency = 1e6 * (float)time_diff_ms / (float)iterationsHigh; // clk speed should be 1/latency, assuming we got one add per clk, roughly clockSpeedGhz = 1/latency; printf("Estimated clock speed: %.2f GHz\n", clockSpeedGhz); // avx-512 testing if (avx512Supported) { if (testName == NULL || 
argc > 1 && strncmp(argv[1], "fma512", 6) == 0) printf("512-bit FMA per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, fma512)); if (testName == NULL || argc > 1 && strncmp(argv[1], "latfma512", 9) == 0) printf("512-bit FMA latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latfma512)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mixfma256fma512", 15) == 0) printf("1:1 256-bit/512-bit FMA per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, mixfma256fma512)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mix21fma256fma512", 17) == 0) printf("2:1 256-bit/512-bit FMA per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, mix21fma256fma512)); if (testName == NULL || argc > 1 && strncmp(argv[1], "nemesfpumix21", 13) == 0) printf("1:2 512b FMA:FADD per clk (nemes): %.2f\n", measureFunction(iterations * 22, clockSpeedGhz, nemesfpu512mix21)); if (testName == NULL || argc > 1 && strncmp(argv[1], "add512int", 9) == 0) printf("512-bit int add per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, add512int)); if (testName == NULL || argc > 1 && strncmp(argv[1], "latadd512int", 12) == 0) printf("512-bit int add latency: %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latadd256int)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mul512int", 9) == 0) printf("512-bit 32-bit int mul per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, mul512int)); if (testName == NULL || argc > 1 && strncmp(argv[1], "muldq512int", 9) == 0) printf("512-bit 32->64-bit int mul per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, muldq512int)); if (testName == NULL || argc > 1 && strncmp(argv[1], "latmulq512int", 13) == 0) printf("512-bit 64-bit int mul latency: %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latmulq512int)); if (testName == NULL || argc > 1 && strncmp(argv[1], "latmul512int", 12) == 0) printf("512-bit 32-bit int mul latency: %.2f clocks\n", 1 / 
measureFunction(iterationsHigh, clockSpeedGhz, latmul512int)); if (testName == NULL || argc > 1 && strncmp(argv[1], "latmuldq512int", 13) == 0) printf("512-bit 32->64-bit int mul latency: %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latmuldq512int)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mixfmaadd512", 11) == 0) printf("1:2 512b PADDQ:FMA per clk: %.2f\n", measureFunction(iterations * 22, clockSpeedGhz, mixfmaadd512)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mixfma512add256", 11) == 0) printf("1:2 256b PADDQ : 512b FMA per clk: %.2f\n", measureFunction(iterations * 22, clockSpeedGhz, mixfma512add256)); if (testName == NULL || argc > 1 && strncmp(argv[1], "load512", 7) == 0) printf("512-bit loads per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, load512wrapper)); if (testName == NULL || argc > 1 && strncmp(argv[1], "store512", 7) == 0) printf("512-bit stores per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, store512wrapper)); if (testName == NULL || argc > 1 && strncmp(argv[1], "aesenc128", 9) == 0) printf("aesenc per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, aesenc128)); if (testName == NULL || argc > 1 && strncmp(argv[1], "aesdec128", 9) == 0) printf("aesdec per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, aesdec128)); if (testName == NULL || argc > 1 && strncmp(argv[1], "aesencadd128", 12) == 0) printf("1:3 aesenc+paddd per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, aesencadd128)); if (testName == NULL || argc > 1 && strncmp(argv[1], "aesencfma128", 12) == 0) printf("1:2 aesenc+fma per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, aesencfma128)); if (testName == NULL || argc > 1 && strncmp(argv[1], "aesencmul128", 12) == 0) printf("1:2 aesenc+pmullw per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, aesencmul128)); if (testName == NULL || argc > 1 && strncmp(argv[1], "aesencmul128", 12) == 0) printf("1:2 aesenc+addps per clk: 
%.2f\n", measureFunction(iterations, clockSpeedGhz, aesencfadd128)); } // throughput if (testName == NULL || argc > 1 && strncmp(argv[1], "1bnop", 5) == 0) printf("1-byte nops per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, noptest1b)); if (testName == NULL || argc > 1 && strncmp(argv[1], "2bnop", 5) == 0) printf("2-byte nops per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, noptest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "add", 3) == 0) printf("Adds per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addtest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "addnop", 7) == 0) printf("1:4 nops/adds per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addnoptest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "addmov", 7) == 0) printf("1:4 movs/adds per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addnoptest)); // renamer throughput printf("--- Renamer tests ---\n"); if (testName == NULL || argc > 1 && strncmp(argv[1], "depmov", 6) == 0) printf("Dependent movs per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, depmovtest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "indepmov", 8) == 0) printf("Independent movs per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, indepmovtest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "xorzero", 7) == 0) printf("xor -> 0 per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, xorzerotest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "movzero", 7) == 0) printf("mov -> 0 per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, movzerotest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "subzero", 7) == 0) printf("sub -> 0 per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, subzerotest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "depinc", 6) == 0) printf("dep inc per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, 
depinctest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "depdec", 6) == 0) printf("dep dec per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, depdectest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "depdec", 6) == 0) printf("dep add immediate per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, depaddimmtest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "clkmov", 6) == 0) printf("dep add + mov pair per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, clkmovtest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "vecdepmov", 9) == 0) printf("Dependent vec movs per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecdepmovtest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "vecindepmov", 12) == 0) printf("Independent vec movs per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecindepmovtest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "vecxorzero", 10) == 0) printf("xor xmm -> 0 per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecxorzerotest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "vecsubzero", 10) == 0) printf("sub xmm -> 0 per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, vecsubzerotest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "memrename", 9) == 0) printf("mov -> [r] -> mov latency: %.2f\n", 1 / measureFunction(iterations, clockSpeedGhz, memrenamewrapper)); // misc mixed integer tests if (testName == NULL || argc > 1 && strncmp(argv[1], "miximuladd", 10) == 0) printf("4:1 adds/imul per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, addmultest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "jmpmul", 6) == 0) printf("1:1 mul/jmp per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, jmpmultest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "addjmp", 6) == 0) printf("3:1 add/jmp per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, 
addjmptest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "jmp", 3) == 0) printf("taken jmp per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, jmptest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "ntjmp", 5) == 0) printf("nt jmp per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, ntjmptest)); if (bmi2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "pdep", 4) == 0)) printf("pdep per clk: %.4f\n", measureFunction(iterationsHigh, clockSpeedGhz, pdeptest)); if (bmi2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "pext", 4) == 0)) printf("pext per clk: %.4f\n", measureFunction(iterationsHigh, clockSpeedGhz, pexttest)); if (bmi2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "pdepmul", 7) == 0)) printf("1:1 pdep/mul per clk: %.4f\n", measureFunction(iterationsHigh, clockSpeedGhz, pdepmultest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "shl", 3) == 0) printf("shl r,1 per clk: %.4f\n", measureFunction(iterationsHigh, clockSpeedGhz, shltest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "ror", 3) == 0) printf("ror r,1 per clk: %.4f\n", measureFunction(iterationsHigh, clockSpeedGhz, rortest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mixrorshl", 9) == 0) printf("1:1 shl/ror r,1 per clk: %.4f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixrorshltest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mixrormul", 3) == 0) printf("1:1 ror/mul per clk: %.4f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixrormultest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "bts", 3) == 0) printf("bts per clk: %.4f\n", measureFunction(iterationsHigh, clockSpeedGhz, btstest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mixmulbts", 9) == 0) printf("1:1 bts/mul per clk: %.4f\n", measureFunction(iterationsHigh, clockSpeedGhz, btsmultest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mixrorbts", 9) == 0) printf("1:1 bts/ror 
per clk: %.4f\n", measureFunction(iterationsHigh, clockSpeedGhz, rorbtstest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "lea", 3) == 0) printf("lea r+r*8 per clk: %.4f\n", measureFunction(iterationsHigh, clockSpeedGhz, leatest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mixmullea", 9) == 0) printf("1:1 lea r+r*8/mul per clk: %.4f\n", measureFunction(iterationsHigh, clockSpeedGhz, leamultest)); // vector and FP if (testName == NULL || argc > 1 && strncmp(argv[1], "fdiv", 4) == 0) printf("divss per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fdivtest)); if (testName == NULL || argc > 1 && strncmp(argv[1], "latfdiv", 7) == 0) printf("divss latency: %.2f\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, fdivlattest)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "avx256int", 9) == 0)) printf("256-bit avx integer add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, add256int)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "mixavx256int", 12) == 0)) printf("2:1 scalar add/256-bit avx integer add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixadd256int)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "mix11avx256int", 14) == 0)) printf("1:1 scalar add/256-bit avx integer add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixadd256int11)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "mixavx256fpint", 14) == 0)) printf("1:1 256-bit avx int add/avx fadd per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mixadd256fpint)); if (avxSupported && (testName == NULL || argc > 1 && strncmp(argv[1], "mix256fp", 8) == 0)) printf("1:1 256-bit avx fp mul/add per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mix256fp)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "latadd256int", 12) == 0)) printf("256-bit avx2 integer add latency: 
%.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latadd256int)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "latmul256int", 12) == 0)) printf("256-bit avx2 integer multiply latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latmul256int)); if (testName == NULL || argc > 1 && strncmp(argv[1], "latadd128int", 12) == 0) printf("128-bit sse integer add latency: %.2f clocks\n", 1 / measureFunction(iterationsHigh, clockSpeedGhz, latadd128int)); if (testName == NULL || argc > 1 && strncmp(argv[1], "latmul128int", 12) == 0) printf("128-bit sse integer multiply latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latmul128int)); if (avxSupported && (testName == NULL || argc > 1 && strncmp(argv[1], "latadd256fp", 11) == 0)) printf("256-bit avx fadd latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latadd256fp)); if (avxSupported && (testName == NULL || argc > 1 && strncmp(argv[1], "latmul256fp", 11) == 0)) printf("256-bit avx fmul latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latmul256fp)); if (testName == NULL || argc > 1 && strncmp(argv[1], "latadd128fp", 11) == 0) printf("128-bit sse fadd latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latadd128fp)); if (testName == NULL || argc > 1 && strncmp(argv[1], "latmul128fp", 11) == 0) printf("128-bit sse fmul latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latmul128fp)); if (testName == NULL || argc > 1 && strncmp(argv[1], "add128fp", 8) == 0) printf("128-bit sse fadd per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, add128fp)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mul128fp", 8) == 0) printf("128-bit sse fmul per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul128fp)); if (testName == NULL || argc > 1 && strncmp(argv[1], "add128int", 9) == 0) printf("128-bit sse int add per clk: %.2f\n", 
measureFunction(iterationsHigh, clockSpeedGhz, add128int)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mul128int", 9) == 0) printf("128-bit sse int mul per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, mul128int)); // set no ftz or daz _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); if (argc == 1 || argc > 1 && strncmp(argv[1], "fmuldenorm", 10) == 0) { float denormTp = measureFunction(iterations, clockSpeedGhz, fmuldenormtest); printf("Scalar FP32 multiply -> denorm per clk: %.2f (%.2f recip)\n", denormTp, 1/denormTp); } _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); if (argc == 1 || argc > 1 && strncmp(argv[1], "fmuldenormftz", 13) == 0) { printf("Scalar FP32 multiply -> denorm (ftz/daz) per clk: %.2f\n", measureFunction(iterationsHigh, clockSpeedGhz, fmuldenormtest)); } if (fmaSupported) { if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "fma256", 6) == 0)) printf("256-bit FMA per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, fma256)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "fma128", 6) == 0)) printf("128-bit FMA per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, fma128)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "latfma256", 9) == 0)) printf("256-bit FMA latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latfma256)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "latfma128", 9) == 0)) printf("128-bit FMA latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latfma128)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "mixfmafadd256", 12) == 0)) printf("1:2 256b FMA:FADD per clk: %.2f\n", measureFunction(iterations * 22, clockSpeedGhz, mixfmafadd256)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "mixfmaadd256", 11) == 0)) 
printf("2:1 256b FMA:PADDQ per clk: %.2f\n", measureFunction(iterations * 22, clockSpeedGhz, mixfmaadd256)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "mixfmaandmem256", 14) == 0)) printf("2:1 256b FMA:PADDQ load-op per clk: %.2f\n", measureFunction(iterations * 22, clockSpeedGhz, mixfmaaddmem256wrapper)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "mixfmaand256", 11) == 0)) printf("2:1 256b FMA:PAND per clk: %.2f\n", measureFunction(iterations * 22, clockSpeedGhz, mixfmaand256)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "mixfmaandmem256", 14) == 0)) printf("2:1 256b FMA:PAND load-op per clk: %.2f\n", measureFunction(iterations * 22, clockSpeedGhz, mixfmaandmem256wrapper)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "nemesfpumix21", 13) == 0)) printf("1:2 256b FMA:FADD per clk (nemes): %.2f\n", measureFunction(iterations * 22, clockSpeedGhz, nemesfpumix21)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "mix256faddintadd", 15) == 0)) printf("1:2 256b FMA:PADD per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, mix256faddintadd)); } if (fma4Supported) { if (testName == NULL || argc > 1 && strncmp(argv[1], "fma4_256", 8) == 0) printf("256-bit FMA4 per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, fma4_256)); if (testName == NULL || argc > 1 && strncmp(argv[1], "fma4_256", 8) == 0) printf("128-bit FMA4 per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, fma4_128)); } if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "fadd256", 6) == 0)) printf("256-bit FADD per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, add256fp)); if (avx2Supported && (testName == NULL || argc > 1 && strncmp(argv[1], "fmul256", 6) == 0)) printf("256-bit FMUL per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, mul256fp)); if (testName == NULL || argc > 1 && strncmp(argv[1], "movqtoxmm", 9) == 
0) printf("MOVQ GPR <-> XMM: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, movqtoxmmtest)); // integer multiply. zhaoxin appears to handle 16-bit and 64-bit multiplies differntly // unlike Intel/AMD CPUs that behave similarly regardless of register width if (testName == NULL || argc > 1 && strncmp(argv[1], "latmul16", 8) == 0) printf("16-bit imul latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latmul16)); if (testName == NULL || argc > 1 && strncmp(argv[1], "latmul64", 8) == 0) printf("64-bit imul latency: %.2f clocks\n", 1 / measureFunction(iterations, clockSpeedGhz, latmul64)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mul16", 5) == 0) printf("16-bit imul per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, mul16)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mul64", 5) == 0) printf("64-bit imul per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, mul64)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mixmul16mul64", 5) == 0) printf("1:1 mixed 16-bit/64-bit imul per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, mixmul16mul64)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mix21mul16mul64", 5) == 0) printf("2:1 mixed 16-bit/64-bit imul per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, mixmul16mul64_21)); // load/store if (testName == NULL || argc > 1 && strncmp(argv[1], "loadscalar", 10) == 0) printf("64-bit scalar loads per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, loadscalarwrapper)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mixedscalarloadstore", 20) == 0) printf("2:1 64-bit scalar loads:stores per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, mixedscalarloadstorewrapper)); if (testName == NULL || argc > 1 && strncmp(argv[1], "load128", 7) == 0) printf("128-bit loads per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, load128wrapper)); if (testName == NULL || argc > 1 && strncmp(argv[1], 
"spacedload128", 13) == 0) printf("128-bit loads (spaced) per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, spacedload128wrapper)); if (avxSupported && (testName == NULL || argc > 1 && strncmp(argv[1], "load256", 7) == 0)) printf("256-bit loads per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, load256wrapper)); if (testName == NULL || argc > 1 && strncmp(argv[1], "spacedstorescalar", 13) == 0) printf("scalar stores (spaced) per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, spacedstorescalarwrapper)); if (testName == NULL || argc > 1 && strncmp(argv[1], "store128", 7) == 0) printf("128-bit stores per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, store128wrapper)); if (avxSupported && (testName == NULL || argc > 1 && strncmp(argv[1], "store256", 7) == 0)) printf("256-bit stores per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, store256wrapper)); if (testName == NULL || argc > 1 && strncmp(argv[1], "mixaddmul128int", 15) == 0) printf("1:1 mixed 128-bit vec add/mul per clk: %.2f\n", measureFunction(iterations, clockSpeedGhz, mixaddmul128int)); return 0; }

/* Work item handed to each benchmark thread by measureFunction. */
struct TestThreadData {
    uint64_t iterations;
    /* Carries the same sysv_abi attribute as measureFunction's parameter so the
       call made from TestThread uses the convention the test routines expect. */
    __attribute((sysv_abi)) uint64_t (*testfunc)(uint64_t);
};

/* pthread entry point: runs one test function with its iteration count.
   param points to a struct TestThreadData; the test's return value is discarded. */
void *TestThread(void *param) {
    struct TestThreadData *testData = (struct TestThreadData *)param;
    testData->testfunc(testData->iterations);
    return NULL;
}

/*
 * Times testfunc(iterations) and converts the result to "iterations per clock".
 *
 * iterations    - count passed to the test function and used as the operation count
 * clockSpeedGhz - clock frequency used to turn a per-nanosecond rate into a per-clock rate
 * testfunc      - routine under test (sysv_abi calling convention)
 *
 * When the global `threads` is 0 the test runs on the calling thread. Otherwise
 * `threads` pthreads each run the full iteration count; the returned figure is
 * still computed from `iterations` alone, i.e. it is a per-thread rate.
 *
 * Returns (iterations / elapsed ns) / clockSpeedGhz, or 0 if no time elapsed or
 * the thread bookkeeping could not be allocated.
 */
float measureFunction(uint64_t iterations, float clockSpeedGhz, __attribute((sysv_abi)) uint64_t (*testfunc)(uint64_t)) {
    struct timeval startTv, endTv;
    double elapsedUs, opsPerNs;

    gettimeofday(&startTv, NULL);
    if (threads == 0) {
        testfunc(iterations);
    } else {
        pthread_t *testThreads = (pthread_t *)malloc(threads * sizeof(pthread_t));
        struct TestThreadData *testData = (struct TestThreadData *)malloc(threads * sizeof(struct TestThreadData));
        int startedThreads = 0;

        if (testThreads == NULL || testData == NULL) {
            fprintf(stderr, "Could not allocate memory for test threads\n");
            free(testThreads);
            free(testData);
            return 0.0f;
        }

        for (int threadIdx = 0; threadIdx < threads; threadIdx++) {
            testData[threadIdx].iterations = iterations;
            testData[threadIdx].testfunc = testfunc;
            if (pthread_create(testThreads + threadIdx, NULL, TestThread, testData + threadIdx) != 0) {
                /* Stop here so only successfully created threads are joined below. */
                fprintf(stderr, "Could not create test thread %d\n", threadIdx);
                break;
            }
            startedThreads++;
        }

        for (int threadIdx = 0; threadIdx < startedThreads; threadIdx++) {
            pthread_join(testThreads[threadIdx], NULL);
        }

        free(testThreads);
        free(testData);
    }
    gettimeofday(&endTv, NULL);

    /* Keep microsecond resolution; rounding to whole milliseconds skews short runs
       and yields a zero divisor when a run finishes in under 1 ms. */
    elapsedUs = 1e6 * (double)(endTv.tv_sec - startTv.tv_sec) + (double)(endTv.tv_usec - startTv.tv_usec);
    if (elapsedUs <= 0.0) {
        return 0.0f;
    }

    opsPerNs = (double)iterations / (elapsedUs * 1e3);
    return (float)(opsPerNs / clockSpeedGhz);
}

/* Adapters that give the two- and three-argument assembly routines the
   single-argument signature measureFunction calls, binding the global
   test/sink arrays as the extra arguments. */
__attribute((sysv_abi)) uint64_t load128wrapper(uint64_t iterations) {
    return load128(iterations, intTestArr);
}

__attribute((sysv_abi)) uint64_t spacedload128wrapper(uint64_t iterations) {
    return spacedload128(iterations, intTestArr);
}

__attribute((sysv_abi)) uint64_t spacedstorescalarwrapper(uint64_t iterations) {
    return spacedstorescalar(iterations, intTestArr);
}

__attribute((sysv_abi)) uint64_t load256wrapper(uint64_t iterations) {
    return load256(iterations, fpTestArr);
}

__attribute((sysv_abi)) uint64_t loadscalarwrapper(uint64_t iterations) {
    return loadscalar(iterations, intTestArr);
}

__attribute((sysv_abi)) uint64_t mixedscalarloadstorewrapper(uint64_t iterations) {
    return mixedscalarloadstore(iterations, intTestArr);
}

__attribute((sysv_abi)) uint64_t load512wrapper(uint64_t iterations) {
    return load512(iterations, fpTestArr);
}

/* NOTE(review): despite the name this forwards to spacedload128 - confirm whether
   a 256-bit spaced-load routine was intended. */
__attribute((sysv_abi)) uint64_t spacedload256wrapper(uint64_t iterations) {
    return spacedload128(iterations, intTestArr);
}

__attribute((sysv_abi)) uint64_t store128wrapper(uint64_t iterations) {
    return store128(iterations, intTestArr, intSinkArr);
}

__attribute((sysv_abi)) uint64_t store256wrapper(uint64_t iterations) {
    return store256(iterations, fpTestArr, fpSinkArr);
}

__attribute((sysv_abi)) uint64_t store512wrapper(uint64_t iterations) {
    return store512(iterations, fpTestArr, fpSinkArr);
}
/* Single-argument adapters for assembly routines that also take a data pointer:
   each forwards the iteration count and binds one of the global arrays. */
__attribute((sysv_abi)) uint64_t mixfmaandmem256wrapper(uint64_t iterations) { return mixfmaandmem256(iterations, fpTestArr); }
__attribute((sysv_abi)) uint64_t mixfmaaddmem256wrapper(uint64_t iterations) { return mixfmaaddmem256(iterations, fpTestArr); }
/* Passes the integer sink array to memrenametest. */
__attribute((sysv_abi)) uint64_t memrenamewrapper(uint64_t iterations) { return memrenametest(iterations, intSinkArr); }
================================================ FILE: InstructionRate/x86_instructionrate.s ================================================
.text .global clktest .global clkmovtest .global addtest .global addnoptest .global addmovtest .global rortest .global shltest .global mixrorshltest .global mixrormultest .global btstest .global leatest .global leamultest .global rorbtstest .global btsmultest .global depmovtest .global indepmovtest .global vecindepmovtest .global vecdepmovtest .global xorzerotest .global vecxorzerotest .global movzerotest .global subzerotest .global vecsubzerotest .global depinctest .global depdectest .global depaddimmtest .global memrenametest .global addmultest .global jmpmultest .global addjmptest .global jmptest .global ntjmptest .global noptest .global noptest1b .global add256int .global add512int .global mul512int .global muldq512int .global mixadd256int .global mixadd256int11 .global mixadd256fpint .global mix256fp .global latadd256int .global latadd128int .global latmul256int .global latmul512int .global latmulq512int .global latmuldq512int .global latmul128int .global latadd256int .global latmul256fp .global latadd256fp .global latmul128fp .global latadd128fp .global fma512 .global mixfma256fma512 .global mix21fma256fma512 .global fma256 .global fma128 .global mixfmafadd256 .global mixfmaadd256 .global mixfmaadd512 .global mixfma512add256 .global mixfmaand256 .global nemesfpumix21 .global nemesfpu512mix21 .global mixfmaandmem256 .global mixfmaaddmem256 .global latfma512 .global latfma256 .global latfma128 .global mul256fp .global add256fp .global add128fp .global
mul128fp .global latmul64 .global latmul16 .global mul16 .global mul64 .global load128 .global spacedload128 .global load256 .global load512 .global store128 .global store256 .global store512 .global loadscalar .global mixedscalarloadstore .global spacedstorescalar .global mixaddmul128int .global mixmul16mul64 .global mixmul16mul64_21 .global add128int .global mul128int .global mix256faddintadd .global movqtoxmmtest .global pdeptest .global pexttest .global pdepmultest .global aesenc128 .global aesdec128 .global aesencadd128 .global aesencfma128 .global aesencfadd128 .global aesencmul128 .global fma4_256 .global fma4_128 .global fdivtest .global fdivlattest .global fmuldenormtest .global fmuldenormlattest /* %rdi = arg0 = iteration count */ clktest: push %rbx push %r8 push %r9 mov $1, %r8 mov $20, %r9 xor %rbx, %rbx clktest_loop: add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx add %r8, %rbx sub %r9, %rdi jnz clktest_loop pop %r9 pop %r8 pop %rbx ret clkmovtest: push %rbx push %r8 push %r9 mov $1, %r8 mov $20, %r9 xor %rbx, %rbx clkmovtest_loop: add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 add %r8, %rbx mov %rbx, %r8 sub %r9, %rdi jnz clkmovtest_loop pop %r9 pop %r8 pop %rbx ret noptest: push %rbx push %r9 mov $20, %r9 noptest_loop: xchg %ax,%ax xchg %ax,%ax xchg %ax,%ax 
/* noptest, continued: remaining xchg %ax,%ax instructions of the loop body
   (19 in total across the loop), then the loop counter update and epilogue. */
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  xchg %ax,%ax
  sub %r9, %rdi
  jnz noptest_loop
  pop %r9
  pop %rbx
  ret

/* noptest1b: same shape as noptest but with 19 single-byte `nop` instructions
   per pass; %rdi drops by 20 per pass. */
noptest1b:
  push %rbx
  push %r9
  mov $20, %r9
noptest1b_loop:
  nop
  nop
  nop
  nop
  nop
  nop
  nop
  nop
  nop
  nop
  nop
  nop
  nop
  nop
  nop
  nop
  nop
  nop
  nop
  sub %r9, %rdi
  jnz noptest1b_loop
  pop %r9
  pop %rbx
  ret

/* addtest: 20 adds of %r8 (=1) per pass spread over seven accumulators
   (%r15, %r14, %r13, %r12, %r11, %r10, %rcx), so consecutive adds do not depend
   on each other. %rdi drops by 20 per pass. */
addtest:
  push %rbx
  push %rcx
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $1, %r8
  mov $20, %r9
  xor %rbx, %rbx
  xor %rcx, %rcx
  xor %r10, %r10
  xor %r11, %r11
  xor %r12, %r12
  xor %r13, %r13
  xor %r14, %r14
  xor %r15, %r15
addtest_loop:
  add %r8, %r15
  add %r8, %r14
  add %r8, %r13
  add %r8, %r12
  add %r8, %r11
  add %r8, %r10
  add %r8, %rcx
  add %r8, %r15
  add %r8, %r14
  add %r8, %r13
  add %r8, %r12
  add %r8, %r11
  add %r8, %r10
  add %r8, %rcx
  add %r8, %r15
  add %r8, %r14
  add %r8, %r13
  add %r8, %r12
  add %r8, %r11
  add %r8, %r10
  sub %r9, %rdi
  jnz addtest_loop
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rcx
  pop %rbx
  ret

/* addnoptest: 16 adds and 4 nops per pass (a nop after every fourth add);
   %rdi drops by 20 per pass. */
addnoptest:
  push %rbx
  push %rcx
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $1, %r8
  mov $20, %r9
  xor %rbx, %rbx
  xor %rcx, %rcx
  xor %r10, %r10
  xor %r11, %r11
  xor %r12, %r12
  xor %r13, %r13
  xor %r14, %r14
  xor %r15, %r15
addnoptest_loop:
  add %r8, %r15
  add %r8, %r14
  add %r8, %r13
  add %r8, %r12
  nop
  add %r8, %r10
  add %r8, %rcx
  add %r8, %r15
  add %r8, %r14
  nop
  add %r8, %r12
  add %r8, %r11
  add %r8, %r10
  add %r8, %rcx
  nop
  add %r8, %r14
  add %r8, %r13
  add %r8, %r12
  add %r8, %r11
  nop
  sub %r9, %rdi
  jnz addnoptest_loop
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rcx
  pop %rbx
  ret

/* addmovtest: same layout as addnoptest, with a register-to-register mov in
   place of each nop (loop body follows the register setup). */
addmovtest:
  push %rbx
  push %rcx
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $1, %r8
  mov $20, %r9
  xor %rbx, %rbx
  xor %rcx, %rcx
  xor %r10, %r10
  xor %r11, %r11
  xor %r12, %r12
  xor %r13, %r13
  xor %r14, %r14
  xor
%r15, %r15
/* addmovtest loop: 16 adds and 4 `mov %r15, %rdx` per pass; %rdi drops by 20.
   %rdx is overwritten without being saved. */
addmovtest_loop:
  add %r8, %r15
  add %r8, %r14
  add %r8, %r13
  add %r8, %r12
  mov %r15, %rdx
  add %r8, %r10
  add %r8, %rcx
  add %r8, %r15
  add %r8, %r14
  mov %r15, %rdx
  add %r8, %r12
  add %r8, %r11
  add %r8, %r10
  add %r8, %rcx
  mov %r15, %rdx
  add %r8, %r14
  add %r8, %r13
  add %r8, %r12
  add %r8, %r11
  mov %r15, %rdx
  sub %r9, %rdi
  jnz addmovtest_loop
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rcx
  pop %rbx
  ret

/* rortest: 20 `ror $1` per pass, rotating five registers (%r15..%r11) in turn so
   each register is rotated four times per pass; %rdi drops by 20. */
rortest:
  push %rbx
  push %rcx
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $1, %r8
  mov $20, %r9
  mov %r8, %rbx
  mov %r8, %rcx
  mov %r8, %r10
  mov %r8, %r11
  mov %r8, %r12
  mov %r8, %r13
  mov %r8, %r14
  mov %r8, %r15
rortest_loop:
  ror $1, %r15
  ror $1, %r14
  ror $1, %r13
  ror $1, %r12
  ror $1, %r11
  ror $1, %r15
  ror $1, %r14
  ror $1, %r13
  ror $1, %r12
  ror $1, %r11
  ror $1, %r15
  ror $1, %r14
  ror $1, %r13
  ror $1, %r12
  ror $1, %r11
  ror $1, %r15
  ror $1, %r14
  ror $1, %r13
  ror $1, %r12
  ror $1, %r11
  sub %r9, %rdi
  jnz rortest_loop
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rcx
  pop %rbx
  ret

/* shltest: identical structure to rortest with `shl $1` instead of `ror $1`. */
shltest:
  push %rbx
  push %rcx
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $1, %r8
  mov $20, %r9
  mov %r8, %rbx
  mov %r8, %rcx
  mov %r8, %r10
  mov %r8, %r11
  mov %r8, %r12
  mov %r8, %r13
  mov %r8, %r14
  mov %r8, %r15
shltest_loop:
  shl $1, %r15
  shl $1, %r14
  shl $1, %r13
  shl $1, %r12
  shl $1, %r11
  shl $1, %r15
  shl $1, %r14
  shl $1, %r13
  shl $1, %r12
  shl $1, %r11
  shl $1, %r15
  shl $1, %r14
  shl $1, %r13
  shl $1, %r12
  shl $1, %r11
  shl $1, %r15
  shl $1, %r14
  shl $1, %r13
  shl $1, %r12
  shl $1, %r11
  sub %r9, %rdi
  jnz shltest_loop
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rcx
  pop %rbx
  ret

/* mixrorshltest: alternates ror and shl over the same five registers
   (loop body follows the register setup). */
mixrorshltest:
  push %rbx
  push %rcx
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $1, %r8
  mov $20, %r9
  mov %r8, %rbx
  mov %r8, %rcx
  mov %r8, %r10
  mov %r8, %r11
  mov %r8, %r12
  mov %r8, %r13
  mov %r8, %r14
  mov %r8, %r15
mixrorshltest_loop:
  ror $1,
%r15
/* mixrorshltest loop, continued: 10 ror and 10 shl per pass, strictly
   alternating; %rdi drops by 20. */
  shl $1, %r14
  ror $1, %r13
  shl $1, %r12
  ror $1, %r11
  shl $1, %r15
  ror $1, %r14
  shl $1, %r13
  ror $1, %r12
  shl $1, %r11
  ror $1, %r15
  shl $1, %r14
  ror $1, %r13
  shl $1, %r12
  ror $1, %r11
  shl $1, %r15
  ror $1, %r14
  shl $1, %r13
  ror $1, %r12
  shl $1, %r11
  sub %r9, %rdi
  jnz mixrorshltest_loop
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rcx
  pop %rbx
  ret

/* mixrormultest: 10 ror and 10 imul per pass. After every imul except the last,
   the destination is overwritten with %r9, so the next imul on that register
   does not depend on the previous product. %rdi drops by 20 per pass (the movs
   are not counted). %rsi is rotated without being initialised here; %rax is
   clobbered. */
mixrormultest:
  push %rbx
  push %rcx
  push %rsi
  push %rdx
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $3, %r8
  mov $20, %r9
  mov %r8, %rbx
  mov %r8, %rcx
  mov %r8, %r10
  mov %r8, %r11
  mov %r8, %r12
  mov %r8, %r13
  mov %r8, %r14
  mov %r8, %r15
mixrormultest_loop:
  ror $1, %r15
  imul %r8, %r14
  mov %r9, %r14
  ror $1, %r13
  imul %r8, %r12
  mov %r9, %r12
  ror $1, %r11
  imul %r8, %r10
  mov %r9, %r10
  ror $1, %rbx
  imul %r8, %rcx
  mov %r9, %rcx
  ror $1, %rsi
  imul %r8, %rax
  mov %r9, %rax
  ror $1, %r15
  imul %r8, %r14
  mov %r9, %r14
  ror $1, %r13
  imul %r8, %r12
  mov %r9, %r12
  ror $1, %r11
  imul %r8, %r10
  mov %r9, %r10
  ror $1, %rbx
  imul %r8, %rcx
  mov %r9, %rcx
  ror $1, %rsi
  imul %r8, %rdx
  sub %r9, %rdi
  jnz mixrormultest_loop
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rdx
  pop %rsi
  pop %rcx
  pop %rbx
  ret

/* rorbtstest: 10 bts and 10 ror per pass, alternating, each on its own register.
   The bit index operand is %r8 (set to 1, then incremented to 2). %rdi drops by
   20 per pass. %rdx and %rsi are used without being initialised here. */
rorbtstest:
  push %rbx
  push %rcx
  push %rdx
  push %rsi
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $1, %r8
  mov $20, %r9
  mov %r8, %rbx
  mov %r8, %rcx
  mov %r8, %r10
  mov %r8, %r11
  mov %r8, %r12
  mov %r8, %r13
  mov %r8, %r14
  mov %r8, %r15
  inc %r8
rorbtstest_loop:
  bts %r8, %r15
  ror $1, %r14
  bts %r8, %r13
  ror $1, %r12
  bts %r8, %r11
  ror $1, %r10
  bts %r8, %rcx
  ror $1, %rbx
  bts %r8, %rdx
  ror $1, %rsi
  bts %r8, %r15
  ror $1, %r14
  bts %r8, %r13
  ror $1, %r12
  bts %r8, %r11
  ror $1, %r10
  bts %r8, %rcx
  ror $1, %rbx
  bts %r8, %rdx
  ror $1, %rsi
  sub %r9, %rdi
  jnz rorbtstest_loop
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rsi
  pop %rdx
  pop %rcx
  pop %rbx
  ret

/* btstest: bts-only loop (register saves continue below). */
btstest:
  push %rbx
  push %rcx
  push %r8
  push %r9
  push %r10
  push %r11
/* btstest, continued: remaining register saves and setup, then 20 bts per pass
   over five registers (%r15..%r11) with bit index %r8 = 2; %rdi drops by 20. */
  push %r12
  push %r13
  push %r14
  push %r15
  mov $1, %r8
  mov $20, %r9
  mov %r8, %rbx
  mov %r8, %rcx
  mov %r8, %r10
  mov %r8, %r11
  mov %r8, %r12
  mov %r8, %r13
  mov %r8, %r14
  mov %r8, %r15
  inc %r8
btstest_loop:
  bts %r8, %r15
  bts %r8, %r14
  bts %r8, %r13
  bts %r8, %r12
  bts %r8, %r11
  bts %r8, %r15
  bts %r8, %r14
  bts %r8, %r13
  bts %r8, %r12
  bts %r8, %r11
  bts %r8, %r15
  bts %r8, %r14
  bts %r8, %r13
  bts %r8, %r12
  bts %r8, %r11
  bts %r8, %r15
  bts %r8, %r14
  bts %r8, %r13
  bts %r8, %r12
  bts %r8, %r11
  sub %r9, %rdi
  jnz btstest_loop
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rcx
  pop %rbx
  ret

/* leatest: 20 `lea (%r9,%rN,8), %rN` per pass (base + index*8) cycling through
   six registers %r10..%r15; each lea reads and rewrites the same register.
   %rdi drops by 20 per pass. */
leatest:
  push %rbx
  push %rcx
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $1, %r8
  mov $20, %r9
  mov %r8, %rbx
  mov %r8, %rcx
  mov %r8, %r10
  mov %r8, %r11
  mov %r8, %r12
  mov %r8, %r13
  mov %r8, %r14
  mov %r8, %r15
  inc %r8
leatest_loop:
  lea (%r9,%r10,8), %r10
  lea (%r9,%r11,8), %r11
  lea (%r9,%r12,8), %r12
  lea (%r9,%r13,8), %r13
  lea (%r9,%r14,8), %r14
  lea (%r9,%r15,8), %r15
  lea (%r9,%r10,8), %r10
  lea (%r9,%r11,8), %r11
  lea (%r9,%r12,8), %r12
  lea (%r9,%r13,8), %r13
  lea (%r9,%r14,8), %r14
  lea (%r9,%r15,8), %r15
  lea (%r9,%r10,8), %r10
  lea (%r9,%r11,8), %r11
  lea (%r9,%r12,8), %r12
  lea (%r9,%r13,8), %r13
  lea (%r9,%r14,8), %r14
  lea (%r9,%r15,8), %r15
  lea (%r9,%r10,8), %r10
  lea (%r9,%r11,8), %r11
  sub %r9, %rdi
  jnz leatest_loop
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rcx
  pop %rbx
  ret

/* leamultest: alternating lea and imul. In the first half of the loop body each
   imul destination (except %rax) is reloaded from %r8 afterwards; the second
   half (continuing below) has no such reloads. %rdx is used by lea without
   being initialised here. */
leamultest:
  push %rbx
  push %rcx
  push %rdx
  push %rsi
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $1, %r8
  mov $20, %r9
  mov %r8, %rbx
  mov %r8, %rcx
  mov %r8, %r10
  mov %r8, %r11
  mov %r8, %r12
  mov %r8, %r13
  mov %r8, %r14
  mov %r8, %r15
  inc %r8
leamultest_loop:
  lea (%r9,%r15,8), %r15
  imul %r8, %r14
  mov %r8, %r14
  lea (%r9,%r13,8), %r13
  imul %r8, %r12
  mov %r8, %r12
  lea (%r9,%r11,8), %r11
  imul %r8, %r10
  mov %r8, %r10
  lea (%r9,%rbx,8), %rbx
  imul %r8, %rcx
  mov %r8, %rcx
  lea (%r9,%rdx,8), %rdx
  imul %r8, %rax
  lea
(%r9,%r15,8), %r15
/* leamultest loop, second half: five more lea/imul pairs, then loop bookkeeping.
   The pass contains 10 lea and 10 imul in total; %rdi drops by 20. */
  imul %r8, %r14
  lea (%r9,%r13,8), %r13
  imul %r8, %r12
  lea (%r9,%r11,8), %r11
  imul %r8, %r10
  lea (%r9,%rbx,8), %rbx
  imul %r8, %rcx
  lea (%r9,%rdx,8), %rdx
  imul %r8, %rax
  sub %r9, %rdi
  jnz leamultest_loop
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rsi
  pop %rdx
  pop %rcx
  pop %rbx
  ret

/* btsmultest: 10 imul and 10 bts per pass, interleaved with movs from %r8 that
   overwrite a previously written register. %rdi drops by 20 per pass (the movs
   are not counted). %rax is clobbered. */
btsmultest:
  push %rbx
  push %rcx
  push %rsi
  push %rdx
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $1, %r8
  mov $20, %r9
  mov %r8, %rbx
  mov %r8, %rdx
  mov %r8, %rsi
  mov %r8, %rcx
  mov %r8, %r10
  mov %r8, %r11
  mov %r8, %r12
  mov %r8, %r13
  mov %r8, %r14
  mov %r8, %r15
  inc %r8
btsmultest_loop:
  imul %r8, %r14
  bts %r8, %r13
  mov %r8, %r13
  imul %r8, %r12
  bts %r8, %r11
  mov %r8, %r11
  imul %r8, %r10
  bts %r8, %rbx
  imul %r8, %rcx
  mov %r8, %rcx
  bts %r8, %rsi
  imul %r8, %rax
  mov %r8, %rax
  bts %r8, %r15
  imul %r8, %r14
  mov %r8, %r14
  bts %r8, %r13
  imul %r8, %r12
  mov %r8, %r12
  bts %r8, %r11
  imul %r8, %r10
  mov %r8, %r10
  bts %r8, %rbx
  imul %r8, %rcx
  mov %r8, %rcx
  bts %r8, %rsi
  imul %r8, %rdx
  mov %r8, %rdx
  bts %r8, %r11
  sub %r9, %rdi
  jnz btsmultest_loop
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rdx
  pop %rsi
  pop %rcx
  pop %rbx
  ret

/* jmptest: chain of unconditional jumps, each skipping over one add that is
   therefore never executed. The chain continues below. */
jmptest:
  push %rsi
  push %rbx
  push %rcx
  push %rdx
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $1, %r8
  mov $20, %r9
  xor %rbx, %rbx
  xor %rcx, %rcx
  xor %r10, %r10
  xor %r11, %r11
  xor %r12, %r12
  xor %r13, %r13
  xor %r14, %r14
  xor %r15, %r15
  xor %rsi, %rsi
  mov %r8, %r10
  mov %r8, %r11
  mov %r8, %rsi
  mov %r8, %rax
  mov %r8, %rdx
jmptest_loop:
  jmp jmptest1
  add $1, %rax
jmptest1:
  jmp jmptest2
  add $2, %rax
jmptest2:
  jmp jmptest3
  add $3, %rax
jmptest3:
  jmp jmptest4
  add $4, %rax
jmptest4:
  jmp jmptest5
  add $5, %rax
jmptest5:
  jmp jmptest6
  add $6, %rax
jmptest6:
  jmp jmptest7
  add $7, %rax
jmptest7:
  jmp jmptest8
  add $8, %rax
jmptest8:
  jmp jmptest9
  add $9, %rax
jmptest9:
  jmp jmptest10
  add $10, %rax
jmptest10:
  jmp jmptest11
  add $11, %rax
jmptest11:
  jmp jmptest12
/* jmptest, continued: rest of the jump chain (19 jmp instructions in total; the
   taken loop branch is counted as the 20th), then the epilogue. */
  add $12, %rax
jmptest12:
  jmp jmptest13
  add $13, %rax
jmptest13:
  jmp jmptest14
  add $14, %rax
jmptest14:
  jmp jmptest15
  add $15, %rax
jmptest15:
  jmp jmptest16
  add $16, %rax
jmptest16:
  jmp jmptest17
  add $17, %rax
jmptest17:
  jmp jmptest18
  add $18, %rax
jmptest18:
  jmp jmptest19
  add $19, %rax
jmptest19:
  /* jump back counts as nr 20 */
  sub %r9, %rdi
  jnz jmptest_loop
jmptest_jellydonut:
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rdx
  pop %rcx
  pop %rbx
  pop %rsi
  ret

/* ntjmptest: 20 cmp/je pairs per pass. %r8 is 1 and %r9 is 20, so the compare is
   never equal and every je falls through (not-taken branches). %rdi drops by 20
   per pass.
   The branches target this function's own exit label. They previously pointed
   at jmpmultest_jellydonut inside a different function, which only worked
   because the branch is never taken and both epilogues happen to match. */
ntjmptest:
  push %rsi
  push %rbx
  push %rcx
  push %rdx
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $1, %r8
  mov $20, %r9
  xor %rbx, %rbx
  xor %rcx, %rcx
  xor %r10, %r10
  xor %r11, %r11
  xor %r12, %r12
  xor %r13, %r13
  xor %r14, %r14
  xor %r15, %r15
  xor %rsi, %rsi
  mov %r8, %r10
  mov %r8, %r11
  mov %r8, %rsi
  mov %r8, %rax
  mov %r8, %rdx
ntjmptest_loop:
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  cmp %r8, %r9
  je ntjmptest_jellydonut
  sub %r9, %rdi
  jnz ntjmptest_loop
ntjmptest_jellydonut:
  pop %r15
  pop %r14
  pop %r13
  pop %r12
  pop %r11
  pop %r10
  pop %r9
  pop %r8
  pop %rdx
  pop %rcx
  pop %rbx
  pop %rsi
  ret

/* addjmptest: three adds followed by a conditional branch, repeated
   (register setup continues below). */
addjmptest:
  push %rsi
  push %rbx
  push %rcx
  push %rdx
  push %r8
  push %r9
  push %r10
  push %r11
  push %r12
  push %r13
  push %r14
  push %r15
  mov $2, %r8
  mov $20, %r9
  xor %rbx, %rbx
  xor %rcx, %rcx
  xor %r11, %r11
  xor
%r12, %r12 xor %r13, %r13 xor %r14, %r14 xor %r15, %r15 xor %rsi, %rsi mov %r8, %r10 mov %r8, %r11 mov %r8, %rsi mov %r8, %rax mov %r8, %rdx addjmptest_loop: add %r8, %r10 add %r11, %r12 add %r13, %r14 jnz addjmptest_jellydonut add %r8, %r10 add %r11, %r12 add %r13, %r14 jnz addjmptest_jellydonut add %r8, %r10 add %r11, %r12 add %r13, %r14 jnz addjmptest_jellydonut add %r8, %r10 add %r11, %r12 add %r13, %r14 jnz addjmptest_jellydonut add %r8, %r10 add %r11, %r12 add %r13, %r14 jnz addjmptest_jellydonut sub %r9, %rdi jnz addjmptest_loop addjmptest_jellydonut: pop %r15 pop %r14 pop %r13 pop %r12 pop %r11 pop %r10 pop %r9 pop %r8 pop %rdx pop %rcx pop %rbx pop %rsi ret jmpmultest: push %rsi push %rbx push %rcx push %rdx push %r8 push %r9 push %r10 push %r11 push %r12 push %r13 push %r14 push %r15 mov $2, %r8 mov $20, %r9 xor %rbx, %rbx xor %rcx, %rcx xor %r11, %r11 xor %r12, %r12 xor %r13, %r13 xor %r14, %r14 xor %r15, %r15 xor %rsi, %rsi mov %r8, %r10 mov %r8, %r11 mov %r8, %rsi mov %r8, %rax mov %r8, %rdx jmpmultest_loop: cmp %r8, %r9 je jmpmultest_jellydonut imul %r8d, %r10d cmp %r8, %r9 je jmpmultest_jellydonut imul %r8d, %esi cmp %r8, %r9 je jmpmultest_jellydonut imul %r8d, %ebx cmp %r8, %r9 je jmpmultest_jellydonut imul %r8d, %edx cmp %r8, %r9 je jmpmultest_jellydonut imul %r8d, %r10d cmp %r8, %r9 je jmpmultest_jellydonut imul %r8d, %esi cmp %r8, %r9 je jmpmultest_jellydonut imul %r8d, %ebx cmp %r8, %r9 je jmpmultest_jellydonut imul %r8d, %edx cmp %r8, %r9 je jmpmultest_jellydonut imul %r8d, %r15d cmp %r8, %r9 je jmpmultest_jellydonut imul %r8d, %r14d sub %r9, %rdi jnz jmpmultest_loop jmpmultest_jellydonut: pop %r15 pop %r14 pop %r13 pop %r12 pop %r11 pop %r10 pop %r9 pop %r8 pop %rdx pop %rcx pop %rbx pop %rsi ret addmultest: push %rsi push %rbx push %rcx push %rdx push %r8 push %r9 push %r10 push %r11 push %r12 push %r13 push %r14 push %r15 mov $1, %r8 mov $40, %r9 xor %rbx, %rbx xor %rcx, %rcx xor %r10, %r10 xor %r11, %r11 xor %r12, %r12 xor %r13, %r13 xor 
%r14, %r14
    xor %r15, %r15
    xor %rsi, %rsi
    mov %r8, %r10
    mov %r8, %r11
    mov %r8, %rsi
    mov %r8, %rax
    mov %r8, %rdx
/*
 * addmultest loop: 8 groups of four adds (into r15, r14, r13, r12) followed
 * by one imul, i.e. 40 instructions per pass, matching the 40 held in r9
 * that is subtracted from rdi each pass. The loop ends when rdi reaches
 * zero, so rdi is presumably a positive multiple of 40 (confirm against
 * the caller).
 *
 * Groups 4 and 8 previously deviated from the pattern: group 4 added to
 * r15 twice and group 8 had no add into r14, leaving only 39 instructions
 * per pass while 40 were still being counted.
 */
addmultest_loop:
    add %r8, %r15
    add %r8, %r14
    add %r8, %r13
    add %r8, %r12
    imul %r8, %r10
    add %r8, %r15
    add %r8, %r14
    add %r8, %r13
    add %r8, %r12
    imul %r8, %rsi
    add %r8, %r15
    add %r8, %r14
    add %r8, %r13
    add %r8, %r12
    imul %r8, %rbx
    add %r8, %r15
    add %r8, %r14
    add %r8, %r13
    add %r8, %r12
    imul %r8, %rdx
    add %r8, %r15
    add %r8, %r14
    add %r8, %r13
    add %r8, %r12
    imul %r8, %r10
    add %r8, %r15
    add %r8, %r14
    add %r8, %r13
    add %r8, %r12
    imul %r8, %rsi
    add %r8, %r15
    add %r8, %r14
    add %r8, %r13
    add %r8, %r12
    imul %r8, %rbx
    add %r8, %r15
    add %r8, %r14
    add %r8, %r13
    add %r8, %r12
    imul %r8, %rdx
    sub %r9, %rdi
    jnz addmultest_loop
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %r11
    pop %r10
    pop %r9
    pop %r8
    pop %rdx
    pop %rcx
    pop %rbx
    pop %rsi
    ret

/*
 * add256int: 20 vpaddq instructions per pass on 256-bit registers, spread
 * over five independent accumulators (ymm1..ymm5) with ymm0 as the common
 * addend.
 *
 * In:  rdi = loop counter; 20 (r9) is subtracted per pass and the loop
 *      ends when the result is zero.
 * Out: rax = low quadword of xmm1.
 *
 * NOTE(review): the broadcast into ymm0 is commented out, so ymm0 (and the
 * copies made from it) hold whatever the caller left there - confirm this
 * is intended.
 */
add256int:
    push %r9
    push %r8
    mov $20, %r9
    movq %r9, %xmm1
    //vpbroadcastq %xmm1, %ymm0
    vmovdqu %ymm0, %ymm1
    vmovdqu %ymm0, %ymm2
    vmovdqu %ymm0, %ymm3
    vmovdqu %ymm0, %ymm4
    vmovdqu %ymm0, %ymm5
add256int_loop:
    vpaddq %ymm0, %ymm1, %ymm1
    vpaddq %ymm0, %ymm2, %ymm2
    vpaddq %ymm0, %ymm3, %ymm3
    vpaddq %ymm0, %ymm4, %ymm4
    vpaddq %ymm0, %ymm5, %ymm5
    vpaddq %ymm0, %ymm1, %ymm1
    vpaddq %ymm0, %ymm2, %ymm2
    vpaddq %ymm0, %ymm3, %ymm3
    vpaddq %ymm0, %ymm4, %ymm4
    vpaddq %ymm0, %ymm5, %ymm5
    vpaddq %ymm0, %ymm1, %ymm1
    vpaddq %ymm0, %ymm2, %ymm2
    vpaddq %ymm0, %ymm3, %ymm3
    vpaddq %ymm0, %ymm4, %ymm4
    vpaddq %ymm0, %ymm5, %ymm5
    vpaddq %ymm0, %ymm1, %ymm1
    vpaddq %ymm0, %ymm2, %ymm2
    vpaddq %ymm0, %ymm3, %ymm3
    vpaddq %ymm0, %ymm4, %ymm4
    vpaddq %ymm0, %ymm5, %ymm5
    sub %r9, %rdi
    jnz add256int_loop
    movq %xmm1, %rax
    vzeroupper
    pop %r8
    pop %r9
    ret

/* mul512int: broadcasts 20 into zmm0, copies it to zmm1..zmm5, then multiplies */
mul512int:
    push %r9
    push %r8
    mov $20, %r9
    movq %r9, %xmm1
    vpbroadcastq %xmm1, %zmm0
    vmovdqu64 %zmm0, %zmm1
    vmovdqu64 %zmm0, %zmm2
    vmovdqu64 %zmm0, %zmm3
    vmovdqu64 %zmm0, %zmm4
    vmovdqu64 %zmm0, %zmm5
mul512int_loop:
    vpmulld %zmm0, %zmm1, %zmm1
    vpmulld %zmm0, %zmm2, %zmm2
    vpmulld %zmm0, %zmm3, %zmm3
    vpmulld %zmm0, %zmm4, %zmm4
    vpmulld %zmm0,
%zmm5, %zmm5 vpmulld %zmm0, %zmm1, %zmm1 vpmulld %zmm0, %zmm2, %zmm2 vpmulld %zmm0, %zmm3, %zmm3 vpmulld %zmm0, %zmm4, %zmm4 vpmulld %zmm0, %zmm5, %zmm5 vpmulld %zmm0, %zmm1, %zmm1 vpmulld %zmm0, %zmm2, %zmm2 vpmulld %zmm0, %zmm3, %zmm3 vpmulld %zmm0, %zmm4, %zmm4 vpmulld %zmm0, %zmm5, %zmm5 vpmulld %zmm0, %zmm1, %zmm1 vpmulld %zmm0, %zmm2, %zmm2 vpmulld %zmm0, %zmm3, %zmm3 vpmulld %zmm0, %zmm4, %zmm4 vpmulld %zmm0, %zmm5, %zmm5 sub %r9, %rdi jnz mul512int_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret muldq512int: push %r9 push %r8 mov $20, %r9 movq %r9, %xmm1 vpbroadcastq %xmm1, %zmm0 vmovdqu64 %zmm0, %zmm1 vmovdqu64 %zmm0, %zmm2 vmovdqu64 %zmm0, %zmm3 vmovdqu64 %zmm0, %zmm4 vmovdqu64 %zmm0, %zmm5 muldq512int_loop: vpmuldq %zmm0, %zmm1, %zmm1 vpmuldq %zmm0, %zmm2, %zmm2 vpmuldq %zmm0, %zmm3, %zmm3 vpmuldq %zmm0, %zmm4, %zmm4 vpmuldq %zmm0, %zmm5, %zmm5 vpmuldq %zmm0, %zmm1, %zmm1 vpmuldq %zmm0, %zmm2, %zmm2 vpmuldq %zmm0, %zmm3, %zmm3 vpmuldq %zmm0, %zmm4, %zmm4 vpmuldq %zmm0, %zmm5, %zmm5 vpmuldq %zmm0, %zmm1, %zmm1 vpmuldq %zmm0, %zmm2, %zmm2 vpmuldq %zmm0, %zmm3, %zmm3 vpmuldq %zmm0, %zmm4, %zmm4 vpmuldq %zmm0, %zmm5, %zmm5 vpmuldq %zmm0, %zmm1, %zmm1 vpmuldq %zmm0, %zmm2, %zmm2 vpmuldq %zmm0, %zmm3, %zmm3 vpmuldq %zmm0, %zmm4, %zmm4 vpmuldq %zmm0, %zmm5, %zmm5 sub %r9, %rdi jnz muldq512int_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret add512int: push %r9 push %r8 mov $20, %r9 movq %r9, %xmm1 vpbroadcastq %xmm1, %zmm0 vmovdqu64 %zmm0, %zmm1 vmovdqu64 %zmm0, %zmm2 vmovdqu64 %zmm0, %zmm3 vmovdqu64 %zmm0, %zmm4 vmovdqu64 %zmm0, %zmm5 add512int_loop: vpaddq %zmm0, %zmm1, %zmm1 vpaddq %zmm0, %zmm2, %zmm2 vpaddq %zmm0, %zmm3, %zmm3 vpaddq %zmm0, %zmm4, %zmm4 vpaddq %zmm0, %zmm5, %zmm5 vpaddq %zmm0, %zmm1, %zmm1 vpaddq %zmm0, %zmm2, %zmm2 vpaddq %zmm0, %zmm3, %zmm3 vpaddq %zmm0, %zmm4, %zmm4 vpaddq %zmm0, %zmm5, %zmm5 vpaddq %zmm0, %zmm1, %zmm1 vpaddq %zmm0, %zmm2, %zmm2 vpaddq %zmm0, %zmm3, %zmm3 vpaddq %zmm0, %zmm4, %zmm4 vpaddq %zmm0, %zmm5, %zmm5 
vpaddq %zmm0, %zmm1, %zmm1 vpaddq %zmm0, %zmm2, %zmm2 vpaddq %zmm0, %zmm3, %zmm3 vpaddq %zmm0, %zmm4, %zmm4 vpaddq %zmm0, %zmm5, %zmm5 sub %r9, %rdi jnz add512int_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret mixadd256fpint: push %r9 push %r8 mov $20, %r9 movq %r9, %xmm1 vpbroadcastq %xmm1, %ymm0 vmovdqu %ymm0, %ymm1 vmovdqu %ymm0, %ymm2 vmovdqu %ymm0, %ymm3 vmovdqu %ymm0, %ymm4 vmovdqu %ymm0, %ymm5 cvtsi2ss %r9, %xmm6 vbroadcastss %xmm6, %ymm6 vmovups %ymm6, %ymm7 vmovups %ymm6, %ymm8 vmovups %ymm6, %ymm9 vmovups %ymm6, %ymm10 vmovups %ymm6, %ymm11 mixadd256fpint_loop: vpaddq %ymm0, %ymm1, %ymm1 vaddps %ymm6, %ymm7, %ymm7 vpaddq %ymm0, %ymm2, %ymm2 vaddps %ymm6, %ymm8, %ymm8 vpaddq %ymm0, %ymm3, %ymm3 vaddps %ymm6, %ymm9, %ymm9 vpaddq %ymm0, %ymm4, %ymm4 vaddps %ymm6, %ymm10, %ymm10 vpaddq %ymm0, %ymm5, %ymm5 vaddps %ymm6, %ymm11, %ymm11 vpaddq %ymm0, %ymm1, %ymm1 vaddps %ymm6, %ymm7, %ymm7 vpaddq %ymm0, %ymm2, %ymm2 vaddps %ymm6, %ymm8, %ymm8 vpaddq %ymm0, %ymm3, %ymm3 vaddps %ymm6, %ymm9, %ymm9 vpaddq %ymm0, %ymm4, %ymm4 vaddps %ymm6, %ymm10, %ymm10 vpaddq %ymm0, %ymm5, %ymm5 vaddps %ymm6, %ymm11, %ymm11 sub %r9, %rdi jnz mixadd256fpint_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret mix256faddintadd: push %r9 push %r8 mov $20, %r9 movq %r9, %xmm1 vpbroadcastq %xmm1, %ymm8 cvtsi2ss %r9, %xmm6 vbroadcastss %xmm6, %ymm6 vmovups %ymm6, %ymm7 vmovups %ymm6, %ymm9 vmovups %ymm6, %ymm11 vmovups %ymm6, %ymm13 vmovups %ymm6, %ymm15 vmovdqu %ymm8, %ymm10 vmovdqu %ymm8, %ymm12 vmovdqu %ymm8, %ymm14 mix256faddintadd_loop: vaddps %ymm6, %ymm7, %ymm7 vpaddd %ymm8, %ymm8, %ymm8 vaddps %ymm6, %ymm9, %ymm9 vpaddd %ymm10, %ymm10, %ymm10 vaddps %ymm6, %ymm11, %ymm11 vpaddd %ymm12, %ymm12, %ymm12 vaddps %ymm6, %ymm13, %ymm13 vpaddd %ymm14, %ymm14, %ymm14 vaddps %ymm6, %ymm15, %ymm15 vpaddd %ymm5, %ymm5, %ymm5 vaddps %ymm6, %ymm7, %ymm7 vpaddd %ymm8, %ymm8, %ymm8 vaddps %ymm6, %ymm9, %ymm9 vpaddd %ymm10, %ymm10, %ymm10 vaddps %ymm6, %ymm11, %ymm11 vpaddd %ymm12, %ymm12, 
%ymm12 vaddps %ymm6, %ymm13, %ymm13 vpaddd %ymm14, %ymm14, %ymm14 vaddps %ymm6, %ymm15, %ymm15 vpaddd %ymm5, %ymm5, %ymm5 sub %r9, %rdi jnz mix256faddintadd_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret mix256fp: push %r9 push %r8 mov $20, %r9 movq %r9, %xmm1 cvtsi2ss %r9, %xmm6 movups %xmm6, -32(%rsp) vbroadcastss -32(%rsp), %ymm6 vmovups %ymm6, %ymm5 vmovups %ymm6, %ymm7 vmovups %ymm6, %ymm8 vmovups %ymm6, %ymm9 vmovups %ymm6, %ymm10 vmovups %ymm6, %ymm11 vmovups %ymm6, %ymm12 vmovups %ymm6, %ymm13 vmovups %ymm6, %ymm14 vmovups %ymm6, %ymm15 mix256fp_loop: vaddps %ymm6, %ymm7, %ymm7 vmulps %ymm6, %ymm8, %ymm8 vaddps %ymm6, %ymm9, %ymm9 vmulps %ymm6, %ymm10, %ymm10 vaddps %ymm6, %ymm11, %ymm11 vmulps %ymm6, %ymm12, %ymm12 vaddps %ymm6, %ymm13, %ymm13 vmulps %ymm6, %ymm14, %ymm14 vaddps %ymm6, %ymm15, %ymm15 vmulps %ymm6, %ymm5, %ymm5 vaddps %ymm6, %ymm7, %ymm7 vmulps %ymm6, %ymm8, %ymm8 vaddps %ymm6, %ymm9, %ymm9 vmulps %ymm6, %ymm10, %ymm10 vaddps %ymm6, %ymm11, %ymm11 vmulps %ymm6, %ymm12, %ymm12 vaddps %ymm6, %ymm13, %ymm13 vmulps %ymm6, %ymm14, %ymm14 vaddps %ymm6, %ymm15, %ymm15 vmulps %ymm6, %ymm5, %ymm5 sub %r9, %rdi jnz mix256fp_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret mixadd256int: push %r9 push %r8 push %r15 push %r14 push %r13 push %r12 push %r11 mov $30, %r9 movq %r9, %xmm1 vpbroadcastq %xmm1, %ymm0 vmovdqu %ymm0, %ymm1 vmovdqu %ymm0, %ymm2 vmovdqu %ymm0, %ymm3 vmovdqu %ymm0, %ymm4 vmovdqu %ymm0, %ymm5 mov %r9, %r15 mov %r9, %r14 mov %r9, %r13 mov %r9, %r12 mov %r9, %r11 mov %r9, %r8 mixadd256int_loop: add %r8, %r11 add %r8, %r12 add %r8, %r13 add %r8, %r14 add %r8, %r15 vpaddq %ymm0, %ymm1, %ymm1 vpaddq %ymm0, %ymm2, %ymm2 vpaddq %ymm0, %ymm3, %ymm3 vpaddq %ymm0, %ymm4, %ymm4 vpaddq %ymm0, %ymm5, %ymm5 add %r8, %r11 add %r8, %r12 add %r8, %r13 add %r8, %r14 add %r8, %r15 add %r8, %r11 add %r8, %r12 add %r8, %r13 add %r8, %r14 add %r8, %r15 vpaddq %ymm0, %ymm1, %ymm1 vpaddq %ymm0, %ymm2, %ymm2 vpaddq %ymm0, %ymm3, %ymm3 vpaddq %ymm0, 
%ymm4, %ymm4 vpaddq %ymm0, %ymm5, %ymm5 add %r8, %r11 add %r8, %r12 add %r8, %r13 add %r8, %r14 add %r8, %r15 sub %r9, %rdi jnz mixadd256int_loop movq %xmm1, %rax vzeroupper pop %r11 pop %r12 pop %r13 pop %r14 pop %r15 pop %r8 pop %r9 ret mixadd256int11: push %r9 push %r8 push %r15 push %r14 push %r13 push %r12 push %r11 mov $20, %r9 movq %r9, %xmm1 vpbroadcastq %xmm1, %ymm0 vmovdqu %ymm0, %ymm1 vmovdqu %ymm0, %ymm2 vmovdqu %ymm0, %ymm3 vmovdqu %ymm0, %ymm4 vmovdqu %ymm0, %ymm5 mov %r9, %r15 mov %r9, %r14 mov %r9, %r13 mov %r9, %r12 mov %r9, %r11 mov %r9, %r8 mixadd256int11_loop: add %r8, %r11 add %r8, %r12 add %r8, %r13 add %r8, %r14 add %r8, %r15 vpaddq %ymm0, %ymm1, %ymm1 vpaddq %ymm0, %ymm2, %ymm2 vpaddq %ymm0, %ymm3, %ymm3 vpaddq %ymm0, %ymm4, %ymm4 vpaddq %ymm0, %ymm5, %ymm5 add %r8, %r11 add %r8, %r12 add %r8, %r13 add %r8, %r14 add %r8, %r15 vpaddq %ymm0, %ymm1, %ymm1 vpaddq %ymm0, %ymm2, %ymm2 vpaddq %ymm0, %ymm3, %ymm3 vpaddq %ymm0, %ymm4, %ymm4 vpaddq %ymm0, %ymm5, %ymm5 sub %r9, %rdi jnz mixadd256int11_loop movq %xmm1, %rax vzeroupper pop %r11 pop %r12 pop %r13 pop %r14 pop %r15 pop %r8 pop %r9 ret latadd256int: push %r9 push %r8 push %r15 push %r14 push %r13 push %r12 push %r11 mov $20, %r9 movq %r9, %xmm1 vpbroadcastq %xmm1, %ymm0 vmovdqu %ymm0, %ymm1 vmovdqu %ymm0, %ymm2 vmovdqu %ymm0, %ymm3 vmovdqu %ymm0, %ymm4 vmovdqu %ymm0, %ymm5 latadd256int_loop: vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 vpaddq %ymm0, %ymm0, %ymm0 sub %r9, %rdi jnz latadd256int_loop movq %xmm1, %rax vzeroupper pop 
%r11
    pop %r12
    pop %r13
    pop %r14
    pop %r15
    pop %r8
    pop %r9
    ret

/*
 * latadd512int: serial chain of 20 vpaddq instructions per pass on a
 * single 512-bit register. Every vpaddq reads and writes zmm0, so each
 * one depends on the result of the previous one.
 *
 * In:  rdi = loop counter; 20 (r9) is subtracted per pass and the loop
 *      ends when the result is zero, so rdi is presumably a positive
 *      multiple of 20 (confirm against the caller).
 * Out: rax = low quadword of xmm1.
 *
 * zmm1..zmm5 are initialised from zmm0 but not touched by the loop.
 * The loop label and its back-edge both use latadd512int_loop; the
 * back-edge previously jumped to latadd256int_loop, which made every pass
 * after the first execute the 256-bit routine's loop instead.
 */
latadd512int:
    push %r9
    push %r8
    push %r15
    push %r14
    push %r13
    push %r12
    push %r11
    mov $20, %r9
    movq %r9, %xmm1
    vpbroadcastq %xmm1, %zmm0
    vmovdqa64 %zmm0, %zmm1
    vmovdqa64 %zmm0, %zmm2
    vmovdqa64 %zmm0, %zmm3
    vmovdqa64 %zmm0, %zmm4
    vmovdqa64 %zmm0, %zmm5
latadd512int_loop:
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    vpaddq %zmm0, %zmm0, %zmm0
    sub %r9, %rdi
    jnz latadd512int_loop
    movq %xmm1, %rax
    vzeroupper
    /* restore in reverse push order */
    pop %r11
    pop %r12
    pop %r13
    pop %r14
    pop %r15
    pop %r8
    pop %r9
    ret

/*
 * latmul512int: serial chain of 20 vpmulld instructions per pass on a
 * single 512-bit register (each one reads and writes zmm0).
 *
 * In:  rdi = loop counter; 20 (r9) is subtracted per pass and the loop
 *      ends when the result is zero.
 * Out: rax = low quadword of xmm1.
 *
 * zmm1..zmm5 are initialised from zmm0 but not touched by the loop.
 */
latmul512int:
    push %r9
    push %r8
    push %r15
    push %r14
    push %r13
    push %r12
    push %r11
    mov $20, %r9
    movq %r9, %xmm1
    vpbroadcastd %xmm1, %zmm0
    vmovdqu64 %zmm0, %zmm1
    vmovdqu64 %zmm0, %zmm2
    vmovdqu64 %zmm0, %zmm3
    vmovdqu64 %zmm0, %zmm4
    vmovdqu64 %zmm0, %zmm5
latmul512int_loop:
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    vpmulld %zmm0, %zmm0, %zmm0
    sub %r9, %rdi
    jnz latmul512int_loop
    movq %xmm1, %rax
    vzeroupper
    pop %r11
    pop %r12
    pop %r13
    pop %r14
    pop %r15
    pop %r8
    pop %r9
    ret

latmuldq512int:
    push %r9
    push %r8
push %r15 push %r14 push %r13 push %r12 push %r11 mov $20, %r9 movq %r9, %xmm1 vpbroadcastd %xmm1, %zmm0 vmovdqu64 %zmm0, %zmm1 vmovdqu64 %zmm0, %zmm2 vmovdqu64 %zmm0, %zmm3 vmovdqu64 %zmm0, %zmm4 vmovdqu64 %zmm0, %zmm5 latmuldq512int_loop: vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 vpmuldq %zmm0, %zmm0, %zmm0 sub %r9, %rdi jnz latmuldq512int_loop movq %xmm1, %rax vzeroupper pop %r11 pop %r12 pop %r13 pop %r14 pop %r15 pop %r8 pop %r9 ret latmulq512int: push %r9 push %r8 push %r15 push %r14 push %r13 push %r12 push %r11 mov $20, %r9 movq %r9, %xmm1 vpbroadcastd %xmm1, %zmm0 vmovdqu64 %zmm0, %zmm1 vmovdqu64 %zmm0, %zmm2 vmovdqu64 %zmm0, %zmm3 vmovdqu64 %zmm0, %zmm4 vmovdqu64 %zmm0, %zmm5 latmulq512int_loop: vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 vpmullq %zmm0, %zmm0, %zmm0 sub %r9, %rdi jnz latmulq512int_loop movq %xmm1, %rax vzeroupper pop %r11 pop %r12 pop %r13 pop %r14 pop %r15 pop %r8 pop %r9 ret latmul256int: push %r9 push %r8 push %r15 push %r14 push %r13 push %r12 push %r11 mov $20, %r9 movq %r9, 
%xmm1 //vpbroadcastd %xmm1, %ymm0 vmovdqu %ymm0, %ymm1 vmovdqu %ymm0, %ymm2 vmovdqu %ymm0, %ymm3 vmovdqu %ymm0, %ymm4 vmovdqu %ymm0, %ymm5 latmul256int_loop: vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 vpmulld %ymm0, %ymm0, %ymm0 sub %r9, %rdi jnz latmul256int_loop movq %xmm1, %rax vzeroupper pop %r11 pop %r12 pop %r13 pop %r14 pop %r15 pop %r8 pop %r9 ret latadd128int: push %r9 mov $20, %r9 movq %r9, %xmm1 //vpbroadcastq %xmm1, %xmm0 latadd128int_loop: paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 paddq %xmm0, %xmm0 sub %r9, %rdi jnz latadd128int_loop movq %xmm1, %rax pop %r9 ret add128int: push %r9 mov $16, %r9 movq %r9, %xmm1 //vpbroadcastq %xmm1, %xmm0 add128int_loop: paddq %xmm0, %xmm0 paddq %xmm1, %xmm1 paddq %xmm2, %xmm2 paddq %xmm3, %xmm3 paddq %xmm4, %xmm4 paddq %xmm5, %xmm5 paddq %xmm6, %xmm6 paddq %xmm7, %xmm7 paddq %xmm0, %xmm0 paddq %xmm1, %xmm1 paddq %xmm2, %xmm2 paddq %xmm3, %xmm3 paddq %xmm4, %xmm4 paddq %xmm5, %xmm5 paddq %xmm6, %xmm6 paddq %xmm7, %xmm7 sub %r9, %rdi jg add128int_loop movq %xmm1, %rax pop %r9 ret aesenc128: push %r9 mov $20, %r9 movq %r9, %xmm1 vzeroall pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 
pxor %xmm5, %xmm5 aesenc128_loop: aesenc %xmm0, %xmm1 aesenc %xmm0, %xmm2 aesenc %xmm0, %xmm3 aesenc %xmm0, %xmm4 aesenc %xmm0, %xmm5 aesenc %xmm0, %xmm1 aesenc %xmm0, %xmm2 aesenc %xmm0, %xmm3 aesenc %xmm0, %xmm4 aesenc %xmm0, %xmm5 aesenc %xmm0, %xmm1 aesenc %xmm0, %xmm2 aesenc %xmm0, %xmm3 aesenc %xmm0, %xmm4 aesenc %xmm0, %xmm5 aesenc %xmm0, %xmm1 aesenc %xmm0, %xmm2 aesenc %xmm0, %xmm3 aesenc %xmm0, %xmm4 aesenc %xmm0, %xmm5 sub %r9, %rdi jnz aesenc128_loop movq %xmm1, %rax pop %r9 ret aesencadd128: push %r9 mov $20, %r9 movq %r9, %xmm1 vzeroall pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 pxor %xmm6, %xmm6 pxor %xmm7, %xmm7 pxor %xmm8, %xmm8 pxor %xmm9, %xmm9 pxor %xmm10, %xmm10 pxor %xmm11, %xmm11 pxor %xmm12, %xmm12 pxor %xmm13, %xmm13 aesencadd128_loop: aesenc %xmm0, %xmm1 paddd %xmm6, %xmm2 paddd %xmm6, %xmm3 paddd %xmm6, %xmm4 aesenc %xmm0, %xmm5 paddd %xmm6, %xmm7 paddd %xmm6, %xmm8 paddd %xmm6, %xmm9 aesenc %xmm0, %xmm10 paddd %xmm6, %xmm2 paddd %xmm6, %xmm3 paddd %xmm6, %xmm4 aesenc %xmm0, %xmm1 paddd %xmm6, %xmm7 paddd %xmm6, %xmm8 paddd %xmm6, %xmm9 aesenc %xmm0, %xmm10 paddd %xmm6, %xmm11 paddd %xmm6, %xmm12 paddd %xmm6, %xmm13 sub %r9, %rdi jnz aesencadd128_loop movq %xmm1, %rax pop %r9 ret aesencfma128: push %r9 mov $15, %r9 movq %r9, %xmm1 vzeroall pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 xorps %xmm2, %xmm2 xorps %xmm3, %xmm3 xorps %xmm4, %xmm4 pxor %xmm5, %xmm5 xorps %xmm6, %xmm6 xorps %xmm7, %xmm7 xorps %xmm8, %xmm8 xorps %xmm9, %xmm9 pxor %xmm10, %xmm10 xorps %xmm11, %xmm11 xorps %xmm12, %xmm12 xorps %xmm13, %xmm13 xorps %xmm14, %xmm14 xorps %xmm15, %xmm15 vxorps %xmm16, %xmm16, %xmm16 vxorps %xmm17, %xmm17, %xmm17 vxorps %xmm18, %xmm18, %xmm18 vxorps %xmm19, %xmm19, %xmm19 aesencfma128_loop: aesenc %xmm0, %xmm1 vfmadd132ps %xmm6, %xmm2, %xmm2 vfmadd132ps %xmm6, %xmm3, %xmm3 aesenc %xmm0, %xmm5 vfmadd132ps %xmm6, %xmm7, %xmm7 vfmadd132ps %xmm6, %xmm8, %xmm8 aesenc %xmm0, %xmm10 
vfmadd132ps %xmm6, %xmm11, %xmm11
    vfmadd132ps %xmm6, %xmm12, %xmm12
    aesenc %xmm0, %xmm1
    vfmadd132ps %xmm6, %xmm14, %xmm14
    vfmadd132ps %xmm6, %xmm15, %xmm15
    aesenc %xmm0, %xmm10
    vfmadd132ps %xmm6, %xmm17, %xmm17
    vfmadd132ps %xmm6, %xmm18, %xmm18
    sub %r9, %rdi
    /* jg rather than jnz, matching aesencfadd128 and aesencmul128 below:
       identical when rdi is a positive multiple of the per-pass count,
       and the loop still ends when it is not. */
    jg aesencfma128_loop
    movq %xmm1, %rax
    pop %r9
    ret

/*
 * aesencfadd128: 15 instructions per pass - 5 aesenc interleaved with
 * 10 vaddps, all on 128-bit registers.
 *
 * In:  rdi = loop counter; 15 (r9) is subtracted per pass and the loop
 *      runs while the result is greater than zero.
 * Out: rax = low quadword of xmm1.
 *
 * All vector registers used by the loop are zeroed first (vzeroall plus
 * explicit xors, including xmm16..xmm19).
 */
aesencfadd128:
    push %r9
    mov $15, %r9
    movq %r9, %xmm1
    vzeroall
    pxor %xmm0, %xmm0
    pxor %xmm1, %xmm1
    xorps %xmm2, %xmm2
    xorps %xmm3, %xmm3
    xorps %xmm4, %xmm4
    pxor %xmm5, %xmm5
    xorps %xmm6, %xmm6
    xorps %xmm7, %xmm7
    xorps %xmm8, %xmm8
    xorps %xmm9, %xmm9
    pxor %xmm10, %xmm10
    xorps %xmm11, %xmm11
    xorps %xmm12, %xmm12
    xorps %xmm13, %xmm13
    xorps %xmm14, %xmm14
    xorps %xmm15, %xmm15
    vxorps %xmm16, %xmm16, %xmm16
    vxorps %xmm17, %xmm17, %xmm17
    vxorps %xmm18, %xmm18, %xmm18
    vxorps %xmm19, %xmm19, %xmm19
aesencfadd128_loop:
    aesenc %xmm0, %xmm1
    vaddps %xmm6, %xmm2, %xmm2
    vaddps %xmm6, %xmm3, %xmm3
    aesenc %xmm0, %xmm5
    vaddps %xmm6, %xmm7, %xmm7
    vaddps %xmm6, %xmm8, %xmm8
    aesenc %xmm0, %xmm10
    vaddps %xmm6, %xmm11, %xmm11
    vaddps %xmm6, %xmm12, %xmm12
    aesenc %xmm0, %xmm1
    vaddps %xmm6, %xmm14, %xmm14
    vaddps %xmm6, %xmm15, %xmm15
    aesenc %xmm0, %xmm10
    vaddps %xmm6, %xmm17, %xmm17
    vaddps %xmm6, %xmm18, %xmm18
    sub %r9, %rdi
    jg aesencfadd128_loop
    movq %xmm1, %rax
    pop %r9
    ret

/*
 * aesencmul128: 15 instructions per pass - 5 aesenc interleaved with
 * 10 pmullw, all on 128-bit registers. xmm6 (loaded with 15 from r9) is
 * the common pmullw source operand.
 *
 * In:  rdi = loop counter; 15 (r9) is subtracted per pass and the loop
 *      runs while the result is greater than zero.
 * Out: rax = low quadword of xmm1.
 *
 * NOTE(review): one pmullw uses xmm6 as both source and destination, so
 * the shared multiplier itself changes every pass - confirm this is
 * intended.
 */
aesencmul128:
    push %r9
    mov $15, %r9
    vzeroall
    movq %r9, %xmm6
    pxor %xmm0, %xmm0
    pxor %xmm5, %xmm5
    pxor %xmm10, %xmm10
    xorps %xmm1, %xmm1
    xorps %xmm2, %xmm2
    xorps %xmm3, %xmm3
    xorps %xmm4, %xmm4
    xorps %xmm7, %xmm7
    xorps %xmm8, %xmm8
    xorps %xmm11, %xmm11
    xorps %xmm12, %xmm12
    xorps %xmm14, %xmm14
    xorps %xmm15, %xmm15
aesencmul128_loop:
    aesenc %xmm0, %xmm1
    pmullw %xmm6, %xmm2
    pmullw %xmm6, %xmm3
    aesenc %xmm0, %xmm5
    pmullw %xmm6, %xmm7
    pmullw %xmm6, %xmm8
    aesenc %xmm0, %xmm10
    pmullw %xmm6, %xmm11
    pmullw %xmm6, %xmm12
    aesenc %xmm0, %xmm1
    pmullw %xmm6, %xmm4
    pmullw %xmm6, %xmm6
    aesenc %xmm0, %xmm10
    pmullw %xmm6, %xmm13
    pmullw %xmm6, %xmm14
    sub %r9, %rdi
    jg aesencmul128_loop
    movq %xmm1, %rax
    pop %r9
    ret

aesdec128:
    push %r9
mov $20, %r9 movq %r9, %xmm1 vzeroall pxor %xmm0, %xmm0 pxor %xmm1, %xmm1 pxor %xmm2, %xmm2 pxor %xmm3, %xmm3 pxor %xmm4, %xmm4 pxor %xmm5, %xmm5 aesdec128_loop: aesdec %xmm0, %xmm1 aesdec %xmm0, %xmm2 aesdec %xmm0, %xmm3 aesdec %xmm0, %xmm4 aesdec %xmm0, %xmm5 aesdec %xmm0, %xmm1 aesdec %xmm0, %xmm2 aesdec %xmm0, %xmm3 aesdec %xmm0, %xmm4 aesdec %xmm0, %xmm5 aesdec %xmm0, %xmm1 aesdec %xmm0, %xmm2 aesdec %xmm0, %xmm3 aesdec %xmm0, %xmm4 aesdec %xmm0, %xmm5 aesdec %xmm0, %xmm1 aesdec %xmm0, %xmm2 aesdec %xmm0, %xmm3 aesdec %xmm0, %xmm4 aesdec %xmm0, %xmm5 sub %r9, %rdi jnz aesdec128_loop movq %xmm1, %rax pop %r9 ret mul128int: push %r9 mov $20, %r9 movq %r9, %xmm1 //vpbroadcastd %xmm1, %xmm0 mul128int_loop: pmulld %xmm0, %xmm0 pmulld %xmm1, %xmm1 pmulld %xmm2, %xmm2 pmulld %xmm3, %xmm3 pmulld %xmm4, %xmm4 pmulld %xmm0, %xmm0 pmulld %xmm1, %xmm1 pmulld %xmm2, %xmm2 pmulld %xmm3, %xmm3 pmulld %xmm4, %xmm4 pmulld %xmm0, %xmm0 pmulld %xmm1, %xmm1 pmulld %xmm2, %xmm2 pmulld %xmm3, %xmm3 pmulld %xmm4, %xmm4 pmulld %xmm0, %xmm0 pmulld %xmm1, %xmm1 pmulld %xmm2, %xmm2 pmulld %xmm3, %xmm3 pmulld %xmm4, %xmm4 sub %r9, %rdi jnz mul128int_loop movq %xmm1, %rax pop %r9 ret latmul128int: push %r9 mov $20, %r9 movq %r9, %xmm1 //vpbroadcastd %xmm1, %xmm0 latmul128int_loop: pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 pmulld %xmm0, %xmm0 sub %r9, %rdi jnz latmul128int_loop movq %xmm1, %rax pop %r9 ret mixaddmul128int: push %r9 mov $20, %r9 movq %r9, %xmm1 //vpbroadcastd %xmm1, %xmm0 movdqa %xmm0, %xmm1 movdqa %xmm0, %xmm2 movdqa %xmm0, %xmm3 movdqa %xmm0, %xmm4 movdqa %xmm0, %xmm5 movdqa %xmm0, %xmm6 movdqa %xmm0, %xmm7 movdqa %xmm0, %xmm8 movdqa %xmm0, 
%xmm9 movdqa %xmm0, %xmm10 mixaddmul128int_loop: pmulld %xmm0, %xmm1 paddd %xmm0, %xmm2 pmulld %xmm0, %xmm3 paddd %xmm0, %xmm4 pmulld %xmm0, %xmm5 paddd %xmm0, %xmm6 pmulld %xmm0, %xmm7 paddd %xmm0, %xmm8 pmulld %xmm0, %xmm9 paddd %xmm0, %xmm10 pmulld %xmm0, %xmm1 paddd %xmm0, %xmm2 pmulld %xmm0, %xmm3 paddd %xmm0, %xmm4 pmulld %xmm0, %xmm5 paddd %xmm0, %xmm6 pmulld %xmm0, %xmm7 paddd %xmm0, %xmm8 pmulld %xmm0, %xmm9 paddd %xmm0, %xmm10 sub %r9, %rdi jnz mixaddmul128int_loop movq %xmm1, %rax pop %r9 ret latadd256fp: push %r9 push %r8 mov $20, %r9 movq %r9, %xmm1 cvtsi2ss %r9, %xmm6 movups %xmm6, -32(%rsp) vbroadcastss -32(%rsp), %ymm6 latadd256fp_loop: vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 vaddps %ymm6, %ymm6, %ymm6 sub %r9, %rdi jnz latadd256fp_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret mul256fp: push %r9 push %r8 mov $20, %r9 cvtsi2ss %r9, %xmm0 movups %xmm0, -32(%rsp) vbroadcastss -32(%rsp), %ymm6 vmovdqa %ymm0, %ymm1 vmovdqa %ymm0, %ymm2 vmovdqa %ymm0, %ymm3 vmovdqa %ymm0, %ymm4 vmovdqa %ymm0, %ymm5 vmovdqa %ymm0, %ymm6 vmovdqa %ymm0, %ymm7 vmovdqa %ymm0, %ymm8 vmovdqa %ymm0, %ymm9 vmovdqa %ymm0, %ymm10 mul256fp_loop: vmulps %ymm0, %ymm1, %ymm1 vmulps %ymm0, %ymm2, %ymm2 vmulps %ymm0, %ymm3, %ymm3 vmulps %ymm0, %ymm4, %ymm4 vmulps %ymm0, %ymm5, %ymm5 vmulps %ymm0, %ymm6, %ymm6 vmulps %ymm0, %ymm7, %ymm7 vmulps %ymm0, %ymm8, %ymm8 vmulps %ymm0, %ymm9, %ymm9 vmulps %ymm0, %ymm10, %ymm10 vmulps %ymm0, %ymm1, %ymm1 vmulps %ymm0, %ymm2, %ymm2 vmulps %ymm0, %ymm3, %ymm3 vmulps %ymm0, %ymm4, 
%ymm4 vmulps %ymm0, %ymm5, %ymm5 vmulps %ymm0, %ymm6, %ymm6 vmulps %ymm0, %ymm7, %ymm7 vmulps %ymm0, %ymm8, %ymm8 vmulps %ymm0, %ymm9, %ymm9 vmulps %ymm0, %ymm10, %ymm10 sub %r9, %rdi jnz mul256fp_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret add256fp: push %r9 push %r8 mov $20, %r9 cvtsi2ss %r9, %xmm0 movups %xmm0, -32(%rsp) vbroadcastss -32(%rsp), %ymm6 vmovdqa %ymm0, %ymm1 vmovdqa %ymm0, %ymm2 vmovdqa %ymm0, %ymm3 vmovdqa %ymm0, %ymm4 vmovdqa %ymm0, %ymm5 vmovdqa %ymm0, %ymm6 vmovdqa %ymm0, %ymm7 vmovdqa %ymm0, %ymm8 vmovdqa %ymm0, %ymm9 vmovdqa %ymm0, %ymm10 add256fp_loop: vaddps %ymm0, %ymm1, %ymm1 vaddps %ymm0, %ymm2, %ymm2 vaddps %ymm0, %ymm3, %ymm3 vaddps %ymm0, %ymm4, %ymm4 vaddps %ymm0, %ymm5, %ymm5 vaddps %ymm0, %ymm6, %ymm6 vaddps %ymm0, %ymm7, %ymm7 vaddps %ymm0, %ymm8, %ymm8 vaddps %ymm0, %ymm9, %ymm9 vaddps %ymm0, %ymm10, %ymm10 vaddps %ymm0, %ymm1, %ymm1 vaddps %ymm0, %ymm2, %ymm2 vaddps %ymm0, %ymm3, %ymm3 vaddps %ymm0, %ymm4, %ymm4 vaddps %ymm0, %ymm5, %ymm5 vaddps %ymm0, %ymm6, %ymm6 vaddps %ymm0, %ymm7, %ymm7 vaddps %ymm0, %ymm8, %ymm8 vaddps %ymm0, %ymm9, %ymm9 vaddps %ymm0, %ymm10, %ymm10 sub %r9, %rdi jnz add256fp_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret latmul256fp: push %r9 push %r8 mov $20, %r9 movq %r9, %xmm1 cvtsi2ss %r9, %xmm6 movups %xmm6, -32(%rsp) vbroadcastss -32(%rsp), %ymm6 latmul256fp_loop: vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 vmulps %ymm6, %ymm6, %ymm6 sub %r9, %rdi jnz latmul256fp_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret fma512: 
push %r9 push %r8 mov $20, %r9 movq %r9, %xmm1 cvtsi2ss %r9, %xmm6 vbroadcastss %xmm6, %zmm6 vmovups %zmm6, %zmm5 vmovups %zmm6, %zmm7 vmovups %zmm6, %zmm8 vmovups %zmm6, %zmm9 vmovups %zmm6, %zmm10 vmovups %zmm6, %zmm11 vmovups %zmm6, %zmm12 vmovups %zmm6, %zmm13 vmovups %zmm6, %zmm14 vmovups %zmm6, %zmm15 fma512_loop: vfmadd132ps %zmm6, %zmm5, %zmm5 vfmadd132ps %zmm6, %zmm7, %zmm7 vfmadd132ps %zmm6, %zmm8, %zmm8 vfmadd132ps %zmm6, %zmm9, %zmm9 vfmadd132ps %zmm6, %zmm10, %zmm10 vfmadd132ps %zmm6, %zmm11, %zmm11 vfmadd132ps %zmm6, %zmm12, %zmm12 vfmadd132ps %zmm6, %zmm13, %zmm13 vfmadd132ps %zmm6, %zmm14, %zmm14 vfmadd132ps %zmm6, %zmm15, %zmm15 vfmadd132ps %zmm6, %zmm5, %zmm5 vfmadd132ps %zmm6, %zmm7, %zmm7 vfmadd132ps %zmm6, %zmm8, %zmm8 vfmadd132ps %zmm6, %zmm9, %zmm9 vfmadd132ps %zmm6, %zmm10, %zmm10 vfmadd132ps %zmm6, %zmm11, %zmm11 vfmadd132ps %zmm6, %zmm12, %zmm12 vfmadd132ps %zmm6, %zmm13, %zmm13 vfmadd132ps %zmm6, %zmm14, %zmm14 vfmadd132ps %zmm6, %zmm15, %zmm15 sub %r9, %rdi jnz fma512_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret mix21fma256fma512: push %r9 push %r8 mov $18, %r9 movq %r9, %xmm1 cvtsi2ss %r9, %xmm6 vbroadcastss %xmm6, %zmm6 vmovups %zmm6, %zmm5 vmovups %zmm6, %zmm7 vmovups %zmm6, %zmm8 vmovups %zmm6, %zmm9 vmovups %zmm6, %zmm10 vmovups %zmm6, %zmm11 vmovups %zmm6, %zmm12 vmovups %zmm6, %zmm13 vmovups %zmm6, %zmm14 vmovups %zmm6, %zmm15 mix21fma256fma512_loop: vfmadd132ps %ymm6, %ymm5, %ymm5 vfmadd132ps %ymm6, %ymm7, %ymm7 vfmadd132ps %zmm6, %zmm8, %zmm8 vfmadd132ps %ymm6, %ymm9, %ymm9 vfmadd132ps %ymm6, %ymm10, %ymm10 vfmadd132ps %zmm6, %zmm11, %zmm11 vfmadd132ps %ymm6, %ymm12, %ymm12 vfmadd132ps %ymm6, %ymm13, %ymm13 vfmadd132ps %zmm6, %zmm14, %zmm14 vfmadd132ps %ymm6, %ymm5, %ymm5 vfmadd132ps %ymm6, %ymm7, %ymm7 vfmadd132ps %zmm6, %zmm8, %zmm8 vfmadd132ps %ymm6, %ymm9, %ymm9 vfmadd132ps %ymm6, %ymm10, %ymm10 vfmadd132ps %zmm6, %zmm11, %zmm11 vfmadd132ps %ymm6, %ymm12, %ymm12 vfmadd132ps %ymm6, %ymm13, %ymm13 vfmadd132ps %zmm6, 
%zmm14, %zmm14 sub %r9, %rdi jg mix21fma256fma512_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret mixfma256fma512: push %r9 push %r8 mov $20, %r9 movq %r9, %xmm1 cvtsi2ss %r9, %xmm6 vbroadcastss %xmm6, %zmm6 vmovups %zmm6, %zmm5 vmovups %zmm6, %zmm7 vmovups %zmm6, %zmm8 vmovups %zmm6, %zmm9 vmovups %zmm6, %zmm10 vmovups %zmm6, %zmm11 vmovups %zmm6, %zmm12 vmovups %zmm6, %zmm13 vmovups %zmm6, %zmm14 vmovups %zmm6, %zmm15 mixfma256fma512_loop: vfmadd132ps %ymm6, %ymm5, %ymm5 vfmadd132ps %zmm6, %zmm7, %zmm7 vfmadd132ps %ymm6, %ymm8, %ymm8 vfmadd132ps %zmm6, %zmm9, %zmm9 vfmadd132ps %ymm6, %ymm10, %ymm10 vfmadd132ps %zmm6, %zmm11, %zmm11 vfmadd132ps %ymm6, %ymm12, %ymm12 vfmadd132ps %zmm6, %zmm13, %zmm13 vfmadd132ps %ymm6, %ymm14, %ymm14 vfmadd132ps %zmm6, %zmm15, %zmm15 vfmadd132ps %ymm6, %ymm5, %ymm5 vfmadd132ps %zmm6, %zmm7, %zmm7 vfmadd132ps %ymm6, %ymm8, %ymm8 vfmadd132ps %zmm6, %zmm9, %zmm9 vfmadd132ps %ymm6, %ymm10, %ymm10 vfmadd132ps %zmm6, %zmm11, %zmm11 vfmadd132ps %ymm6, %ymm12, %ymm12 vfmadd132ps %zmm6, %zmm13, %zmm13 vfmadd132ps %ymm6, %ymm14, %ymm14 vfmadd132ps %zmm6, %zmm15, %zmm15 sub %r9, %rdi jnz mixfma256fma512_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret fma256: push %r9 push %r8 mov $20, %r9 movq %r9, %xmm1 cvtsi2ss %r9, %xmm6 movups %xmm6, -32(%rsp) vbroadcastss -32(%rsp), %ymm6 vmovups %ymm6, %ymm5 vmovups %ymm6, %ymm7 vmovups %ymm6, %ymm8 vmovups %ymm6, %ymm9 vmovups %ymm6, %ymm10 vmovups %ymm6, %ymm11 vmovups %ymm6, %ymm12 vmovups %ymm6, %ymm13 vmovups %ymm6, %ymm14 vmovups %ymm6, %ymm15 fma256_loop: vfmadd132ps %ymm6, %ymm5, %ymm5 vfmadd132ps %ymm6, %ymm7, %ymm7 vfmadd132ps %ymm6, %ymm8, %ymm8 vfmadd132ps %ymm6, %ymm9, %ymm9 vfmadd132ps %ymm6, %ymm10, %ymm10 vfmadd132ps %ymm6, %ymm11, %ymm11 vfmadd132ps %ymm6, %ymm12, %ymm12 vfmadd132ps %ymm6, %ymm13, %ymm13 vfmadd132ps %ymm6, %ymm14, %ymm14 vfmadd132ps %ymm6, %ymm15, %ymm15 vfmadd132ps %ymm6, %ymm5, %ymm5 vfmadd132ps %ymm6, %ymm7, %ymm7 vfmadd132ps %ymm6, %ymm8, %ymm8 vfmadd132ps 
%ymm6, %ymm9, %ymm9
    /* Tail of fma256_loop (its head lies before this span); reproduced unchanged. */
    vfmadd132ps %ymm6, %ymm10, %ymm10
    vfmadd132ps %ymm6, %ymm11, %ymm11
    vfmadd132ps %ymm6, %ymm12, %ymm12
    vfmadd132ps %ymm6, %ymm13, %ymm13
    vfmadd132ps %ymm6, %ymm14, %ymm14
    vfmadd132ps %ymm6, %ymm15, %ymm15
    sub %r9, %rdi
    jnz fma256_loop
    movq %xmm1, %rax
    vzeroupper
    pop %r8
    pop %r9
    ret

/*
 * fma128: 128-bit vfmadd132ps throughput loop.
 * Ten independent accumulators (xmm5, xmm7-xmm15) are each updated twice per
 * pass, i.e. 20 FMAs per pass, and rdi is decremented by 20 per pass.
 * The loop exits only when rdi reaches exactly zero (jnz), so the count in rdi
 * has to be a multiple of 20.
 * rax receives xmm1, which still holds the integer 20 stored by movq.
 * r8 is saved and restored although the routine never uses it.
 */
fma128:
    push %r9
    push %r8
    vzeroupper
    mov $20, %r9
    movq %r9, %xmm1
    cvtsi2ss %r9, %xmm6
    /* Bounce through memory below rsp so vbroadcastss can use a memory source;
       presumably relies on the ABI red zone - confirm for non-SysV targets. */
    movups %xmm6, -16(%rsp)
    vbroadcastss -16(%rsp), %xmm6
    vmovups %xmm6, %xmm5
    vmovups %xmm6, %xmm7
    vmovups %xmm6, %xmm8
    vmovups %xmm6, %xmm9
    vmovups %xmm6, %xmm10
    vmovups %xmm6, %xmm11
    vmovups %xmm6, %xmm12
    vmovups %xmm6, %xmm13
    vmovups %xmm6, %xmm14
    vmovups %xmm6, %xmm15
fma128_loop:
    vfmadd132ps %xmm6, %xmm5, %xmm5
    vfmadd132ps %xmm6, %xmm7, %xmm7
    vfmadd132ps %xmm6, %xmm8, %xmm8
    vfmadd132ps %xmm6, %xmm9, %xmm9
    vfmadd132ps %xmm6, %xmm10, %xmm10
    vfmadd132ps %xmm6, %xmm11, %xmm11
    vfmadd132ps %xmm6, %xmm12, %xmm12
    vfmadd132ps %xmm6, %xmm13, %xmm13
    vfmadd132ps %xmm6, %xmm14, %xmm14
    vfmadd132ps %xmm6, %xmm15, %xmm15
    vfmadd132ps %xmm6, %xmm5, %xmm5
    vfmadd132ps %xmm6, %xmm7, %xmm7
    vfmadd132ps %xmm6, %xmm8, %xmm8
    vfmadd132ps %xmm6, %xmm9, %xmm9
    vfmadd132ps %xmm6, %xmm10, %xmm10
    vfmadd132ps %xmm6, %xmm11, %xmm11
    vfmadd132ps %xmm6, %xmm12, %xmm12
    vfmadd132ps %xmm6, %xmm13, %xmm13
    vfmadd132ps %xmm6, %xmm14, %xmm14
    vfmadd132ps %xmm6, %xmm15, %xmm15
    sub %r9, %rdi
    jnz fma128_loop
    movq %xmm1, %rax
    vzeroupper
    pop %r8
    pop %r9
    ret

/*
 * mixfmafadd256: 256-bit mix of vfmadd132ps and vaddps, 30 instructions per
 * pass (20 FMAs, 10 adds); rdi is decremented by 30 per pass.
 * The loop now terminates with jg instead of jnz: with a step of 30, a count
 * that is not a multiple of 30 would step past zero and keep running. With jg
 * the loop stops as soon as the counter is <= 0, matching the other mixed
 * tests that use a non-20 step. Counts that are multiples of 30 behave
 * exactly as before.
 * The value moved to rax is the low 64 bits of ymm1 after the loop; xmm1 is
 * overwritten by the accumulator setup, so it is not the step constant.
 * r8 is saved and restored although the routine never uses it.
 */
mixfmafadd256:
    push %r9
    push %r8
    mov $30, %r9
    movq %r9, %xmm1
    cvtsi2ss %r9, %xmm6
    vbroadcastss %xmm6, %ymm6
    vmovups %ymm6, %ymm0
    vmovups %ymm6, %ymm1
    vmovups %ymm6, %ymm2
    vmovups %ymm6, %ymm3
    vmovups %ymm6, %ymm4
    vmovups %ymm6, %ymm5
    vmovups %ymm6, %ymm7
    vmovups %ymm6, %ymm8
    vmovups %ymm6, %ymm9
    vmovups %ymm6, %ymm10
    vmovups %ymm6, %ymm11
    vmovups %ymm6, %ymm12
    vmovups %ymm6, %ymm13
    vmovups %ymm6, %ymm14
    vmovups %ymm6, %ymm15
mixfmafadd256_loop:
    vfmadd132ps %ymm6, %ymm5, %ymm5
    vfmadd132ps %ymm6, %ymm7, %ymm7
    vaddps %ymm10, %ymm5, %ymm11
    vfmadd132ps %ymm6, %ymm8, %ymm8
    vfmadd132ps %ymm6, %ymm9, %ymm9
    vaddps %ymm12, %ymm5, %ymm13
    vfmadd132ps %ymm6, %ymm14, %ymm14
    vfmadd132ps %ymm6, %ymm15, %ymm15
    vaddps %ymm12, %ymm6, %ymm13
    vfmadd132ps %ymm6, %ymm0, %ymm1
    vfmadd132ps %ymm6, %ymm2, %ymm3
    vaddps %ymm6, %ymm5, %ymm4
    vfmadd132ps %ymm6, %ymm5, %ymm5
    vfmadd132ps %ymm6, %ymm7, %ymm7
    vaddps %ymm10, %ymm6, %ymm11
    vfmadd132ps %ymm6, %ymm8, %ymm8
    vfmadd132ps %ymm6, %ymm9, %ymm9
    vaddps %ymm12, %ymm7, %ymm13
    vfmadd132ps %ymm6, %ymm14, %ymm14
    vfmadd132ps %ymm6, %ymm15, %ymm15
    vaddps %ymm12, %ymm5, %ymm13
    vfmadd132ps %ymm6, %ymm0, %ymm1
    vfmadd132ps %ymm6, %ymm2, %ymm3
    vaddps %ymm6, %ymm5, %ymm4
    vfmadd132ps %ymm6, %ymm5, %ymm5
    vfmadd132ps %ymm6, %ymm7, %ymm7
    vaddps %ymm10, %ymm6, %ymm11
    vfmadd132ps %ymm6, %ymm8, %ymm8
    vfmadd132ps %ymm6, %ymm9, %ymm9
    vaddps %ymm12, %ymm5, %ymm13
    sub %r9, %rdi
    /* jg, not jnz: stop once the counter is exhausted even if it skips zero */
    jg mixfmafadd256_loop
    movq %xmm1, %rax
    vzeroupper
    pop %r8
    pop %r9
    ret

/*
 * mixfmaadd512: 512-bit mix of integer vpaddq and vfmadd132ps.
 * zmm0/3/6/9/12 are integer accumulators (zmm15 is the addend), the remaining
 * registers zmm1-zmm14 are float accumulators that are squared-and-added to
 * themselves. rdi is decremented by r9 (16) per pass; the loop ends when the
 * counter is <= 0 (jg).
 * NOTE(review): the loop body holds 15 instructions while the step is 16 -
 * confirm whether the step constant should be 15.
 * The value moved to rax is the low 64 bits of zmm1 after the loop.
 */
mixfmaadd512:
    push %r9
    push %r8
    mov $16, %r9
    movq %r9, %xmm0
    vpbroadcastq %xmm0, %zmm0
    cvtsi2ss %r9, %xmm1
    vbroadcastss %xmm1, %zmm1
    vmovdqa64 %zmm0, %zmm3
    vmovdqa64 %zmm0, %zmm6
    vmovdqa64 %zmm0, %zmm9
    vmovdqa64 %zmm0, %zmm12
    vmovdqa64 %zmm0, %zmm15
    vmovaps %zmm1, %zmm2
    vmovaps %zmm1, %zmm4
    vmovaps %zmm1, %zmm5
    vmovaps %zmm1, %zmm7
    vmovaps %zmm1, %zmm8
    vmovaps %zmm1, %zmm10
    vmovaps %zmm1, %zmm11
    vmovaps %zmm1, %zmm13
    vmovaps %zmm1, %zmm14
mixfmaadd512_loop:
    vpaddq %zmm0, %zmm15, %zmm0
    vfmadd132ps %zmm1, %zmm1, %zmm1
    vfmadd132ps %zmm2, %zmm2, %zmm2
    vpaddq %zmm3, %zmm15, %zmm3
    vfmadd132ps %zmm4, %zmm4, %zmm4
    vfmadd132ps %zmm5, %zmm5, %zmm5
    vpaddq %zmm6, %zmm15, %zmm6
    vfmadd132ps %zmm7, %zmm7, %zmm7
    vfmadd132ps %zmm8, %zmm8, %zmm8
    vpaddq %zmm9, %zmm15, %zmm9
    vfmadd132ps %zmm10, %zmm10, %zmm10
    vfmadd132ps %zmm11, %zmm11, %zmm11
    vpaddq %zmm12, %zmm15, %zmm12
    vfmadd132ps %zmm13, %zmm13, %zmm13
    vfmadd132ps %zmm14, %zmm14, %zmm14
    sub %r9, %rdi
    jg mixfmaadd512_loop
    movq %xmm1, %rax
    vzeroupper
    pop %r8
    pop %r9
    ret

/* Head of mixfma512add256 (continues past this span); reproduced unchanged. */
mixfma512add256:
    push %r9
    push %r8
    mov $16, %r9
    movq %r9, %xmm0
    vpbroadcastq %xmm0, %ymm0
    cvtsi2ss
%r9, %xmm1 vbroadcastss %xmm1, %zmm1 vmovdqa %ymm0, %ymm3 vmovdqa %ymm0, %ymm6 vmovdqa %ymm0, %ymm9 vmovdqa %ymm0, %ymm12 vmovdqa %ymm0, %ymm15 vmovaps %zmm1, %zmm2 vmovaps %zmm1, %zmm4 vmovaps %zmm1, %zmm5 vmovaps %zmm1, %zmm7 vmovaps %zmm1, %zmm8 vmovaps %zmm1, %zmm10 vmovaps %zmm1, %zmm11 vmovaps %zmm1, %zmm13 vmovaps %zmm1, %zmm14 mixfma512add256_loop: vpaddq %ymm0, %ymm15, %ymm0 vfmadd132ps %zmm1, %zmm1, %zmm1 vfmadd132ps %zmm2, %zmm2, %zmm2 vpaddq %ymm3, %ymm15, %ymm3 vfmadd132ps %zmm4, %zmm4, %zmm4 vfmadd132ps %zmm5, %zmm5, %zmm5 vpaddq %ymm6, %ymm15, %ymm6 vfmadd132ps %zmm7, %zmm7, %zmm7 vfmadd132ps %zmm8, %zmm8, %zmm8 vpaddq %ymm9, %ymm15, %ymm9 vfmadd132ps %zmm10, %zmm10, %zmm10 vfmadd132ps %zmm11, %zmm11, %zmm11 vpaddq %ymm12, %ymm15, %ymm12 vfmadd132ps %zmm13, %zmm13, %zmm13 vfmadd132ps %zmm14, %zmm14, %zmm14 sub %r9, %rdi jg mixfma512add256_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret mixfmaadd256: push %r9 push %r8 mov $16, %r9 movq %r9, %xmm0 vpbroadcastq %xmm0, %ymm0 cvtsi2ss %r9, %xmm1 vbroadcastss %xmm1, %ymm1 vmovdqa %ymm0, %ymm3 vmovdqa %ymm0, %ymm6 vmovdqa %ymm0, %ymm9 vmovdqa %ymm0, %ymm12 vmovdqa %ymm0, %ymm15 vmovaps %ymm1, %ymm2 vmovaps %ymm1, %ymm4 vmovaps %ymm1, %ymm5 vmovaps %ymm1, %ymm7 vmovaps %ymm1, %ymm8 vmovaps %ymm1, %ymm10 vmovaps %ymm1, %ymm11 vmovaps %ymm1, %ymm13 vmovaps %ymm1, %ymm14 mixfmaadd256_loop: vpaddq %ymm0, %ymm15, %ymm0 vfmadd132ps %ymm1, %ymm1, %ymm1 vfmadd132ps %ymm2, %ymm2, %ymm2 vpaddq %ymm3, %ymm15, %ymm3 vfmadd132ps %ymm4, %ymm4, %ymm4 vfmadd132ps %ymm5, %ymm5, %ymm5 vpaddq %ymm6, %ymm15, %ymm6 vfmadd132ps %ymm7, %ymm7, %ymm7 vfmadd132ps %ymm8, %ymm8, %ymm8 vpaddq %ymm9, %ymm15, %ymm9 vfmadd132ps %ymm10, %ymm10, %ymm10 vfmadd132ps %ymm11, %ymm11, %ymm11 vpaddq %ymm12, %ymm15, %ymm12 vfmadd132ps %ymm13, %ymm13, %ymm13 vfmadd132ps %ymm14, %ymm14, %ymm14 sub %r9, %rdi jg mixfmaadd256_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret mixfmaand256: push %r9 push %r8 mov $15, %r9 movq %r9, %xmm0 
vpbroadcastq %xmm0, %ymm0 cvtsi2ss %r9, %xmm1 vbroadcastss %xmm1, %ymm1 vmovdqa %ymm0, %ymm3 vmovdqa %ymm0, %ymm6 vmovdqa %ymm0, %ymm9 vmovdqa %ymm0, %ymm12 vmovdqa %ymm0, %ymm15 vmovaps %ymm1, %ymm2 vmovaps %ymm1, %ymm4 vmovaps %ymm1, %ymm5 vmovaps %ymm1, %ymm7 vmovaps %ymm1, %ymm8 vmovaps %ymm1, %ymm10 vmovaps %ymm1, %ymm11 vmovaps %ymm1, %ymm13 vmovaps %ymm1, %ymm14 mixfmaand256_loop: vpand %ymm0, %ymm15, %ymm0 vfmadd132ps %ymm1, %ymm1, %ymm1 vfmadd132ps %ymm2, %ymm2, %ymm2 vpand %ymm3, %ymm15, %ymm3 vfmadd132ps %ymm4, %ymm4, %ymm4 vfmadd132ps %ymm5, %ymm5, %ymm5 vpand %ymm6, %ymm15, %ymm6 vfmadd132ps %ymm7, %ymm7, %ymm7 vfmadd132ps %ymm8, %ymm8, %ymm8 vpand %ymm9, %ymm15, %ymm9 vfmadd132ps %ymm10, %ymm10, %ymm10 vfmadd132ps %ymm11, %ymm11, %ymm11 vpand %ymm12, %ymm15, %ymm12 vfmadd132ps %ymm13, %ymm13, %ymm13 vfmadd132ps %ymm14, %ymm14, %ymm14 sub %r9, %rdi jg mixfmaand256_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret mixfmaandmem256: push %r9 push %r8 mov $22, %r9 movq %r9, %xmm0 vpbroadcastq %xmm0, %ymm0 cvtsi2ss %r9, %xmm1 vbroadcastss %xmm1, %ymm1 vmovdqa %ymm0, %ymm3 vmovaps %ymm1, %ymm6 vmovaps %ymm1, %ymm9 vmovaps %ymm1, %ymm12 vmovaps %ymm1, %ymm15 vmovaps %ymm1, %ymm2 vmovaps %ymm1, %ymm4 vmovaps %ymm1, %ymm5 vmovaps %ymm1, %ymm7 vmovaps %ymm1, %ymm8 vmovaps %ymm1, %ymm10 vmovaps %ymm1, %ymm11 vmovaps %ymm1, %ymm13 vmovaps %ymm1, %ymm14 mixfmaandmem256_loop: vpand %ymm0, %ymm0, %ymm0 vfmadd132ps %ymm1, %ymm1, %ymm1 vfmadd132ps (%rsi), %ymm2, %ymm2 vpand %ymm3, %ymm3, %ymm3 vfmadd132ps %ymm4, %ymm4, %ymm4 vfmadd132ps (%rsi), %ymm5, %ymm5 vpand %ymm0, %ymm0, %ymm0 vfmadd132ps %ymm7, %ymm7, %ymm7 vfmadd132ps (%rsi), %ymm8, %ymm8 vpand %ymm3, %ymm3, %ymm3 vfmadd132ps %ymm10, %ymm10, %ymm10 vfmadd132ps (%rsi), %ymm11, %ymm11 vpand %ymm0, %ymm0, %ymm0 vfmadd132ps %ymm13, %ymm13, %ymm13 vfmadd132ps (%rsi), %ymm14, %ymm14 vpand %ymm3, %ymm3, %ymm3 vfmadd132ps %ymm6, %ymm6, %ymm6 vfmadd132ps (%rsi), %ymm9, %ymm9 vpand %ymm0, %ymm0, %ymm0 vfmadd132ps 
%ymm12, %ymm12, %ymm12
    /* Tail of mixfmaandmem256_loop (its head lies before this span); unchanged. */
    vfmadd132ps (%rsi), %ymm15, %ymm15
    sub %r9, %rdi
    jg mixfmaandmem256_loop
    movq %xmm1, %rax
    vzeroupper
    pop %r8
    pop %r9
    ret

/*
 * mixfmaaddmem256: 256-bit mix of integer vpaddq, register-source
 * vfmadd132ps and vfmadd132ps with a memory operand read from (%rsi).
 * ymm0 and ymm3 are the integer accumulators; the other registers are float
 * accumulators. rdi is decremented by r9 (22) per pass and the loop ends when
 * the counter is <= 0 (jg).
 * NOTE(review): the loop body holds 21 instructions while the step is 22 -
 * confirm whether the step constant should be 21.
 * (%rsi) is read with a 32-byte access; presumably the caller passes a buffer
 * of at least that size - confirm against the caller.
 * The value moved to rax is the low 64 bits of ymm1 after the loop.
 */
mixfmaaddmem256:
    push %r9
    push %r8
    mov $22, %r9
    movq %r9, %xmm0
    vpbroadcastq %xmm0, %ymm0
    cvtsi2ss %r9, %xmm1
    vbroadcastss %xmm1, %ymm1
    vmovdqa %ymm0, %ymm3
    vmovaps %ymm1, %ymm6
    vmovaps %ymm1, %ymm9
    vmovaps %ymm1, %ymm12
    vmovaps %ymm1, %ymm15
    vmovaps %ymm1, %ymm2
    vmovaps %ymm1, %ymm4
    vmovaps %ymm1, %ymm5
    vmovaps %ymm1, %ymm7
    vmovaps %ymm1, %ymm8
    vmovaps %ymm1, %ymm10
    vmovaps %ymm1, %ymm11
    vmovaps %ymm1, %ymm13
    vmovaps %ymm1, %ymm14
mixfmaaddmem256_loop:
    vpaddq %ymm0, %ymm0, %ymm0
    vfmadd132ps %ymm1, %ymm1, %ymm1
    vfmadd132ps (%rsi), %ymm2, %ymm2
    vpaddq %ymm3, %ymm3, %ymm3
    vfmadd132ps %ymm4, %ymm4, %ymm4
    vfmadd132ps (%rsi), %ymm5, %ymm5
    vpaddq %ymm0, %ymm0, %ymm0
    vfmadd132ps %ymm7, %ymm7, %ymm7
    vfmadd132ps (%rsi), %ymm8, %ymm8
    vpaddq %ymm3, %ymm3, %ymm3
    vfmadd132ps %ymm10, %ymm10, %ymm10
    vfmadd132ps (%rsi), %ymm11, %ymm11
    vpaddq %ymm0, %ymm0, %ymm0
    vfmadd132ps %ymm13, %ymm13, %ymm13
    vfmadd132ps (%rsi), %ymm14, %ymm14
    vpaddq %ymm3, %ymm3, %ymm3
    vfmadd132ps %ymm6, %ymm6, %ymm6
    vfmadd132ps (%rsi), %ymm9, %ymm9
    vpaddq %ymm0, %ymm0, %ymm0
    vfmadd132ps %ymm12, %ymm12, %ymm12
    vfmadd132ps (%rsi), %ymm15, %ymm15
    sub %r9, %rdi
    jg mixfmaaddmem256_loop
    movq %xmm1, %rax
    vzeroupper
    pop %r8
    pop %r9
    ret

/*
 * nemesfpu512mix21: 512-bit mix of vaddps and vfmadd132ps on sixteen
 * independent registers (zmm0-zmm15): 6 adds and 10 FMAs, 16 instructions per
 * pass. rdi is decremented by 16 per pass; the loop ends when the counter is
 * <= 0 (jg). rax is not written.
 *
 * Changes from the previous version:
 *  - the add on register 12 used %ymm12 while every other instruction in the
 *    loop is 512 bits wide; it now uses %zmm12 so the whole loop is 512-bit.
 *  - vzeroupper is issued before ret, as the other AVX routines in this span
 *    do, so the routine does not return with dirty upper vector state.
 *
 * NOTE(review): zmm0 is only seeded in its low element by cvtsi2ss; its other
 * lanes hold whatever the caller left there - confirm this is acceptable.
 */
nemesfpu512mix21:
    push %r9
    mov $16, %r9
    cvtsi2ss %r9, %xmm0
    vbroadcastss %xmm0, %zmm1
    vmovdqa64 %zmm1, %zmm2
    vmovdqa64 %zmm1, %zmm3
    vmovdqa64 %zmm1, %zmm4
    vmovdqa64 %zmm1, %zmm5
    vmovdqa64 %zmm1, %zmm6
    vmovdqa64 %zmm1, %zmm7
    vmovdqa64 %zmm1, %zmm8
    vmovdqa64 %zmm1, %zmm9
    vmovdqa64 %zmm1, %zmm10
    vmovdqa64 %zmm1, %zmm11
    vmovdqa64 %zmm1, %zmm12
    vmovdqa64 %zmm1, %zmm13
    vmovdqa64 %zmm1, %zmm14
    vmovdqa64 %zmm1, %zmm15
nemesfpu512mix21_loop:
    vaddps %zmm0, %zmm0, %zmm0
    vfmadd132ps %zmm1, %zmm1, %zmm1
    vfmadd132ps %zmm2, %zmm2, %zmm2
    vaddps %zmm3, %zmm3, %zmm3
    vfmadd132ps %zmm4, %zmm4, %zmm4
    vfmadd132ps %zmm5, %zmm5, %zmm5
    vaddps %zmm6, %zmm6, %zmm6
    vfmadd132ps %zmm7, %zmm7, %zmm7
    vfmadd132ps %zmm8, %zmm8, %zmm8
    vaddps %zmm9, %zmm9, %zmm9
    vfmadd132ps %zmm10, %zmm10, %zmm10
    vfmadd132ps %zmm11, %zmm11, %zmm11
    /* was %ymm12: keep this add at 512 bits like the rest of the loop */
    vaddps %zmm12, %zmm12, %zmm12
    vfmadd132ps %zmm13, %zmm13, %zmm13
    vfmadd132ps %zmm14, %zmm14, %zmm14
    vaddps %zmm15, %zmm15, %zmm15
    sub %r9, %rdi
    jg nemesfpu512mix21_loop
    vzeroupper
    pop %r9
    ret

/*
 * nemesfpumix21: 256-bit counterpart of nemesfpu512mix21 - 6 vaddps and
 * 10 vfmadd132ps on ymm0-ymm15, 16 instructions per pass. rdi is decremented
 * by 16 per pass; the loop ends when the counter is <= 0 (jg). rax is not
 * written.
 * vzeroupper is now issued before ret, as the other AVX routines in this span
 * do.
 */
nemesfpumix21:
    push %r9
    mov $16, %r9
    cvtsi2ss %r9, %xmm0
    vbroadcastss %xmm0, %ymm1
    vmovdqa %ymm1, %ymm2
    vmovdqa %ymm1, %ymm3
    vmovdqa %ymm1, %ymm4
    vmovdqa %ymm1, %ymm5
    vmovdqa %ymm1, %ymm6
    vmovdqa %ymm1, %ymm7
    vmovdqa %ymm1, %ymm8
    vmovdqa %ymm1, %ymm9
    vmovdqa %ymm1, %ymm10
    vmovdqa %ymm1, %ymm11
    vmovdqa %ymm1, %ymm12
    vmovdqa %ymm1, %ymm13
    vmovdqa %ymm1, %ymm14
    vmovdqa %ymm1, %ymm15
nemesfpumix21_loop:
    vaddps %ymm0, %ymm0, %ymm0
    vfmadd132ps %ymm1, %ymm1, %ymm1
    vfmadd132ps %ymm2, %ymm2, %ymm2
    vaddps %ymm3, %ymm3, %ymm3
    vfmadd132ps %ymm4, %ymm4, %ymm4
    vfmadd132ps %ymm5, %ymm5, %ymm5
    vaddps %ymm6, %ymm6, %ymm6
    vfmadd132ps %ymm7, %ymm7, %ymm7
    vfmadd132ps %ymm8, %ymm8, %ymm8
    vaddps %ymm9, %ymm9, %ymm9
    vfmadd132ps %ymm10, %ymm10, %ymm10
    vfmadd132ps %ymm11, %ymm11, %ymm11
    vaddps %ymm12, %ymm12, %ymm12
    vfmadd132ps %ymm13, %ymm13, %ymm13
    vfmadd132ps %ymm14, %ymm14, %ymm14
    vaddps %ymm15, %ymm15, %ymm15
    sub %r9, %rdi
    jg nemesfpumix21_loop
    vzeroupper
    pop %r9
    ret

/* Head of latfma512 (continues past this span); reproduced unchanged.
   Every FMA in the loop writes zmm7 and reads it back, forming one chain. */
latfma512:
    push %r9
    push %r8
    mov $20, %r9
    movq %r9, %xmm1
    cvtsi2ss %r9, %xmm6
    vbroadcastss %xmm6, %zmm6
    vmovups %zmm6, %zmm5
    vmovups %zmm6, %zmm7
    vmovups %zmm6, %zmm8
    vmovups %zmm6, %zmm9
    vmovups %zmm6, %zmm10
    vmovups %zmm6, %zmm11
    vmovups %zmm6, %zmm12
    vmovups %zmm6, %zmm13
    vmovups %zmm6, %zmm14
    vmovups %zmm6, %zmm15
latfma512_loop:
    vfmadd132ps %zmm6, %zmm5, %zmm7
    vfmadd132ps %zmm6, %zmm5, %zmm7
    vfmadd132ps %zmm6, %zmm5, %zmm7
    vfmadd132ps %zmm6, %zmm5, %zmm7
    vfmadd132ps %zmm6, %zmm5, %zmm7
    vfmadd132ps %zmm6, %zmm5, %zmm7
    vfmadd132ps %zmm6, %zmm5, %zmm7
    vfmadd132ps %zmm6, %zmm5, %zmm7
    vfmadd132ps %zmm6, %zmm5, %zmm7
    vfmadd132ps %zmm6, %zmm5, %zmm7
    vfmadd132ps %zmm6, %zmm5, %zmm7
    vfmadd132ps %zmm6,
%zmm5, %zmm7 vfmadd132ps %zmm6, %zmm5, %zmm7 vfmadd132ps %zmm6, %zmm5, %zmm7 vfmadd132ps %zmm6, %zmm5, %zmm7 vfmadd132ps %zmm6, %zmm5, %zmm7 vfmadd132ps %zmm6, %zmm5, %zmm7 vfmadd132ps %zmm6, %zmm5, %zmm7 vfmadd132ps %zmm6, %zmm5, %zmm7 vfmadd132ps %zmm6, %zmm5, %zmm7 sub %r9, %rdi jnz latfma512_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret latfma256: push %r9 push %r8 mov $20, %r9 movq %r9, %xmm1 cvtsi2ss %r9, %xmm6 vbroadcastss %xmm6, %ymm6 vmovups %ymm6, %ymm5 vmovups %ymm6, %ymm7 vmovups %ymm6, %ymm8 vmovups %ymm6, %ymm9 vmovups %ymm6, %ymm10 vmovups %ymm6, %ymm11 vmovups %ymm6, %ymm12 vmovups %ymm6, %ymm13 vmovups %ymm6, %ymm14 vmovups %ymm6, %ymm15 latfma256_loop: vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 vfmadd132ps %ymm6, %ymm5, %ymm7 sub %r9, %rdi jnz latfma256_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret latfma128: push %r9 push %r8 vzeroupper mov $20, %r9 movq %r9, %xmm1 cvtsi2ss %r9, %xmm6 vbroadcastss %xmm6, %xmm6 vmovups %xmm6, %xmm5 vmovups %xmm6, %xmm7 vmovups %xmm6, %xmm8 vmovups %xmm6, %xmm9 vmovups %xmm6, %xmm10 vmovups %xmm6, %xmm11 vmovups %xmm6, %xmm12 vmovups %xmm6, %xmm13 vmovups %xmm6, %xmm14 vmovups %xmm6, %xmm15 latfma128_loop: vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, 
%xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 vfmadd132ps %xmm6, %xmm5, %xmm7 sub %r9, %rdi jnz latfma128_loop movq %xmm1, %rax vzeroupper pop %r8 pop %r9 ret latadd128fp: push %r9 push %r8 mov $20, %r9 movq %r9, %xmm1 cvtsi2ss %r9, %xmm6 //vbroadcastss %xmm6, %xmm6 latadd128fp_loop: addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 addps %xmm6, %xmm6 sub %r9, %rdi jnz latadd128fp_loop movq %xmm1, %rax pop %r8 pop %r9 ret latmul128fp: push %r9 push %r8 mov $20, %r9 movq %r9, %xmm1 cvtsi2ss %r9, %xmm6 //vbroadcastss %xmm6, %xmm6 latmul128fp_loop: mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 mulps %xmm6, %xmm6 sub %r9, %rdi jnz latmul128fp_loop movq %xmm1, %rax pop %r8 pop %r9 ret mul128fp: push %r9 push %r8 mov $20, %r9 cvtsi2ss %r9, %xmm4 cvtsi2ss %r9, %xmm3 cvtsi2ss %r9, %xmm2 cvtsi2ss %r9, %xmm1 cvtsi2ss %r9, %xmm0 mul128fp_loop: mulps %xmm0, %xmm0 mulps %xmm1, %xmm1 mulps %xmm2, %xmm2 mulps %xmm3, %xmm3 mulps %xmm4, %xmm4 mulps %xmm0, %xmm0 mulps %xmm1, %xmm1 mulps %xmm2, %xmm2 mulps %xmm3, %xmm3 mulps %xmm4, %xmm4 mulps %xmm0, %xmm0 
mulps %xmm1, %xmm1 mulps %xmm2, %xmm2 mulps %xmm3, %xmm3 mulps %xmm4, %xmm4 mulps %xmm0, %xmm0 mulps %xmm1, %xmm1 mulps %xmm2, %xmm2 mulps %xmm3, %xmm3 mulps %xmm4, %xmm4 sub %r9, %rdi jnz mul128fp_loop movq %xmm1, %rax pop %r8 pop %r9 ret add128fp: push %r9 push %r8 mov $20, %r9 cvtsi2ss %r9, %xmm4 cvtsi2ss %r9, %xmm3 cvtsi2ss %r9, %xmm2 cvtsi2ss %r9, %xmm1 cvtsi2ss %r9, %xmm0 add128fp_loop: addps %xmm0, %xmm0 addps %xmm1, %xmm1 addps %xmm2, %xmm2 addps %xmm3, %xmm3 addps %xmm4, %xmm4 addps %xmm0, %xmm0 addps %xmm1, %xmm1 addps %xmm2, %xmm2 addps %xmm3, %xmm3 addps %xmm4, %xmm4 addps %xmm0, %xmm0 addps %xmm1, %xmm1 addps %xmm2, %xmm2 addps %xmm3, %xmm3 addps %xmm4, %xmm4 addps %xmm0, %xmm0 addps %xmm1, %xmm1 addps %xmm2, %xmm2 addps %xmm3, %xmm3 addps %xmm4, %xmm4 sub %r9, %rdi jnz add128fp_loop movq %xmm1, %rax pop %r8 pop %r9 ret latmul64: push %rbx push %rcx push %r8 push %r9 push %r10 push %r11 push %r12 push %r13 push %r14 push %r15 mov $1, %r8 mov $20, %r9 mov %r8, %rbx mov %r8, %rcx mov %r8, %r10 mov %r8, %r11 mov %r8, %r12 mov %r8, %r13 mov %r8, %r14 mov %r9, %r15 latmul64_loop: imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 imul %r9, %r15 sub %r9, %rdi jnz latmul64_loop pop %r15 pop %r14 pop %r13 pop %r12 pop %r11 pop %r10 pop %r9 pop %r8 pop %rcx pop %rbx ret latmul16: push %rbx push %rcx push %r8 push %r9 push %r10 push %r11 push %r12 push %r13 push %r14 push %r15 mov $1, %r8 mov $20, %r9 mov %r8, %rbx mov %r8, %rcx mov %r8, %r10 mov %r8, %r11 mov %r8, %r12 mov %r8, %r13 mov %r8, %r14 mov %r9, %r15 latmul16_loop: imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul 
%r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w imul %r9w, %r15w sub %r9, %rdi jnz latmul16_loop pop %r15 pop %r14 pop %r13 pop %r12 pop %r11 pop %r10 pop %r9 pop %r8 pop %rcx pop %rbx ret mul16: push %rbx push %rcx push %r8 push %r9 push %r10 push %r11 push %r12 push %r13 push %r14 push %r15 mov $1, %r8 mov $20, %r9 mov %r8, %rbx mov %r8, %rcx mov %r8, %r10 mov %r8, %r11 mov %r8, %r12 mov %r8, %r13 mov %r8, %r14 mov %r9, %r15 mul16_loop: imul %r9w, %r15w imul %r9w, %r14w imul %r9w, %r13w imul %r9w, %r12w imul %r9w, %r11w imul %r9w, %r15w imul %r9w, %r14w imul %r9w, %r13w imul %r9w, %r12w imul %r9w, %r11w imul %r9w, %r15w imul %r9w, %r14w imul %r9w, %r13w imul %r9w, %r12w imul %r9w, %r11w imul %r9w, %r15w imul %r9w, %r14w imul %r9w, %r13w imul %r9w, %r12w imul %r9w, %r11w sub %r9, %rdi jnz mul16_loop pop %r15 pop %r14 pop %r13 pop %r12 pop %r11 pop %r10 pop %r9 pop %r8 pop %rcx pop %rbx ret mul64: push %rbx push %rcx push %rsi push %r8 push %r9 push %r10 push %r11 push %r12 push %r13 push %r14 push %r15 mov $1, %r8 mov $20, %r9 mov %r8, %rbx mov %r8, %rcx mov %r8, %r10 mov %r8, %r11 mov %r8, %r12 mov %r8, %r13 mov %r8, %r14 mov %r9, %r15 mul64_loop: imul %r9, %r15 mov %r9, %r15 imul %r9, %r14 mov %r9, %r14 imul %r9, %r13 mov %r9, %r13 imul %r9, %r12 mov %r9, %r12 imul %r9, %r11 mov %r9, %r11 imul %r9, %r10 mov %r9, %r10 imul %r9, %r8 mov %r9, %r8 imul %r9, %rbx mov %r9, %rbx imul %r9, %rcx mov %r9, %rcx imul %r9, %rsi mov %r9, %rsi imul %r9, %r15 mov %r9, %r15 imul %r9, %r14 mov %r9, %r14 imul %r9, %r13 mov %r9, %r13 imul %r9, %r12 mov %r9, %r12 imul %r9, %r11 mov %r9, %r11 imul %r9, %r10 mov %r9, %r10 imul %r9, %r8 mov %r9, %r8 imul %r9, %rbx mov %r9, %rbx imul %r9, %rcx mov %r9, %rcx imul %r9, %rsi mov %r9, %rsi sub %r9, %rdi jnz mul64_loop pop %r15 pop %r14 pop %r13 pop %r12 pop %r11 pop %r10 pop %r9 pop %r8 pop %rsi pop %rcx pop %rbx ret mixmul16mul64: push %rbx push %rcx push %rsi push %r8 push 
%r9
    /* Remainder of mixmul16mul64 (its label and first pushes lie before this
       span); reproduced unchanged. Alternates 64-bit and 16-bit imul on ten
       destination registers, 20 multiplies per pass. */
    push %r10
    push %r11
    push %r12
    push %r13
    push %r14
    push %r15
    mov $1, %r8
    mov $20, %r9
    mov %r8, %rbx
    mov %r8, %rcx
    mov %r8, %r10
    mov %r8, %r11
    mov %r8, %r12
    mov %r8, %r13
    mov %r8, %r14
    mov %r9, %r15
mixmul16mul64_loop:
    imul %r9, %r15
    imul %r9w, %r14w
    imul %r9, %r13
    imul %r9w, %r12w
    imul %r9, %r11
    imul %r9w, %r10w
    imul %r9, %r8
    imul %r9w, %bx
    imul %r9, %rcx
    imul %r9w, %si
    imul %r9, %r15
    imul %r9w, %r14w
    imul %r9, %r13
    imul %r9w, %r12w
    imul %r9, %r11
    imul %r9w, %r10w
    imul %r9, %r8
    imul %r9w, %bx
    imul %r9, %rcx
    imul %r9w, %si
    sub %r9, %rdi
    jnz mixmul16mul64_loop
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %r11
    pop %r10
    pop %r9
    pop %r8
    pop %rsi
    pop %rcx
    pop %rbx
    ret

/*
 * mixmul16mul64_21: mix of 16-bit and 64-bit imul at a 2:1 ratio -
 * 16 imul on 16-bit registers and 8 imul on 64-bit registers, 24 multiplies
 * per pass. rdi is decremented by 24 per pass.
 * The 16-bit multiplies cycle over r14w/r13w/r11w/r10w; each 64-bit multiply
 * targets a different register (r15, r12, r8, rcx, rbx, rax, rsi, rdx).
 * rax is used as a multiply destination without being initialised or saved,
 * so the value left in rax on return is not meaningful.
 *
 * The loop now ends with jg instead of jge: jge ran one extra pass whenever
 * the counter landed exactly on zero. jg stops at <= 0, like the other
 * loops in this file that use a step other than 20. Counts that are not a
 * multiple of 24 behave exactly as before.
 */
mixmul16mul64_21:
    push %rbx
    push %rcx
    push %rdx
    push %rsi
    push %r8
    push %r9
    push %r10
    push %r11
    push %r12
    push %r13
    push %r14
    push %r15
    mov $1, %r8
    mov $24, %r9
    mov %r8, %rbx
    mov %r8, %rcx
    mov %r8, %rsi
    mov %r8, %r10
    mov %r8, %r11
    mov %r8, %r12
    mov %r8, %r13
    mov %r8, %r14
    mov %r9, %r15
mixmul16mul64_21_loop:
    imul %r9, %r15
    imul %r9w, %r14w
    imul %r9w, %r13w
    imul %r9, %r12
    imul %r9w, %r11w
    imul %r9w, %r10w
    imul %r9, %r8
    imul %r9w, %r14w
    imul %r9w, %r13w
    imul %r9, %rcx
    imul %r9w, %r11w
    imul %r9w, %r10w
    imul %r9, %rbx
    imul %r9w, %r14w
    imul %r9w, %r13w
    imul %r9, %rax
    imul %r9w, %r11w
    imul %r9w, %r10w
    imul %r9, %rsi
    imul %r9w, %r14w
    imul %r9w, %r13w
    imul %r9, %rdx
    imul %r9w, %r11w
    imul %r9w, %r10w
    sub %r9, %rdi
    /* jg, not jge: do not run an extra pass when the counter hits zero */
    jg mixmul16mul64_21_loop
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %r11
    pop %r10
    pop %r9
    pop %r8
    pop %rsi
    pop %rdx
    pop %rcx
    pop %rbx
    ret

/* Head of loadscalar (continues past this span); reproduced unchanged.
   Issues 8-byte loads from consecutive offsets off rsi into r10-r15. */
loadscalar:
    push %rbx
    push %rcx
    push %r8
    push %r9
    push %r10
    push %r11
    push %r12
    push %r13
    push %r14
    push %r15
    mov $20, %r9
loadscalar_loop:
    mov (%rsi), %r15
    mov 8(%rsi), %r14
    mov 16(%rsi), %r13
    mov 24(%rsi), %r12
    mov 32(%rsi), %r11
    mov 40(%rsi), %r10
    mov 48(%rsi), %r15
    mov 56(%rsi), %r14
    mov 64(%rsi), %r13
    mov 72(%rsi), %r12
    mov 80(%rsi), %r11
    mov 88(%rsi), %r10
    mov 96(%rsi), %r15
    mov 104(%rsi), %r14
    mov 112(%rsi), %r13
    mov 120(%rsi), %r12
    mov 128(%rsi), %r11
    mov 136(%rsi),
%r10 mov 144(%rsi), %r15 mov 152(%rsi), %r14 sub %r9, %rdi jnz loadscalar_loop pop %r15 pop %r14 pop %r13 pop %r12 pop %r11 pop %r10 pop %r9 pop %r8 pop %rcx pop %rbx ret spacedstorescalar: push %rbx push %rcx push %r8 push %r9 mov $20, %r9 spacedstorescalar_loop: mov %rdi, (%rsi) mov %rdi, 64(%rsi) mov %rdi, 128(%rsi) mov %rdi, 192(%rsi) mov %rdi, 256(%rsi) mov %rdi, 320(%rsi) mov %rdi, 384(%rsi) mov %rdi, 448(%rsi) mov %rdi, 512(%rsi) mov %rdi, 576(%rsi) mov %rdi, 640(%rsi) mov %rdi, 704(%rsi) mov %rdi, 768(%rsi) mov %rdi, 832(%rsi) mov %rdi, 896(%rsi) mov %rdi, 960(%rsi) mov %rdi, 1024(%rsi) mov %rdi, 1088(%rsi) mov %rdi, 1152(%rsi) mov %rdi, 1216(%rsi) sub %r9, %rdi jnz spacedstorescalar_loop pop %r9 pop %r8 pop %rcx pop %rbx ret mixedscalarloadstore: push %rbx push %rcx push %r8 push %r9 push %r10 push %r11 push %r12 push %r13 push %r14 push %r15 mov $12, %r9 mixedscalarloadstore_loop: mov (%rsi), %r15 mov 8(%rsi), %r14 mov %r9, 400(%rsi) mov 16(%rsi), %r13 mov 24(%rsi), %r12 mov %r9, 408(%rsi) mov 32(%rsi), %r11 mov 40(%rsi), %r10 mov %r9, 416(%rsi) mov 48(%rsi), %r15 mov 56(%rsi), %r14 mov %r9, 424(%rsi) sub %r9, %rdi jg mixedscalarloadstore_loop pop %r15 pop %r14 pop %r13 pop %r12 pop %r11 pop %r10 pop %r9 pop %r8 pop %rcx pop %rbx ret spacedload128: push %rbx push %rcx push %r8 push %r9 mov $20, %r9 spacedload128_loop: movdqa (%rsi), %xmm10 movdqa 64(%rsi), %xmm11 movdqa 128(%rsi), %xmm12 movdqa 192(%rsi), %xmm13 movdqa 256(%rsi), %xmm14 movdqa 320(%rsi), %xmm10 movdqa 384(%rsi), %xmm11 movdqa 448(%rsi), %xmm12 movdqa 512(%rsi), %xmm13 movdqa 576(%rsi), %xmm14 movdqa 640(%rsi), %xmm10 movdqa 704(%rsi), %xmm11 movdqa 768(%rsi), %xmm12 movdqa 832(%rsi), %xmm13 movdqa 896(%rsi), %xmm14 movdqa 960(%rsi), %xmm10 movdqa 1024(%rsi), %xmm11 movdqa 1088(%rsi), %xmm12 movdqa 1152(%rsi), %xmm13 movdqa 1216(%rsi), %xmm14 sub %r9, %rdi jnz spacedload128_loop pop %r9 pop %r8 pop %rcx pop %rbx ret load128: push %rbx push %rcx push %r8 push %r9 mov $20, %r9 load128_loop: 
movdqa (%rsi), %xmm10 movdqa (%rsi), %xmm11 movdqa (%rsi), %xmm12 movdqa (%rsi), %xmm13 movdqa (%rsi), %xmm14 movdqa (%rsi), %xmm10 movdqa (%rsi), %xmm11 movdqa (%rsi), %xmm12 movdqa (%rsi), %xmm13 movdqa (%rsi), %xmm14 movdqa (%rsi), %xmm10 movdqa (%rsi), %xmm11 movdqa (%rsi), %xmm12 movdqa (%rsi), %xmm13 movdqa (%rsi), %xmm14 movdqa (%rsi), %xmm10 movdqa (%rsi), %xmm11 movdqa (%rsi), %xmm12 movdqa (%rsi), %xmm13 movdqa (%rsi), %xmm14 sub %r9, %rdi jnz load128_loop pop %r9 pop %r8 pop %rcx pop %rbx ret load256: push %rbx push %rcx push %r8 push %r9 mov $20, %r9 load256_loop: vmovaps (%rsi), %ymm10 vmovaps (%rsi), %ymm11 vmovaps (%rsi), %ymm12 vmovaps (%rsi), %ymm13 vmovaps (%rsi), %ymm14 vmovaps (%rsi), %ymm10 vmovaps (%rsi), %ymm11 vmovaps (%rsi), %ymm12 vmovaps (%rsi), %ymm13 vmovaps (%rsi), %ymm14 vmovaps (%rsi), %ymm10 vmovaps (%rsi), %ymm11 vmovaps (%rsi), %ymm12 vmovaps (%rsi), %ymm13 vmovaps (%rsi), %ymm14 vmovaps (%rsi), %ymm10 vmovaps (%rsi), %ymm11 vmovaps (%rsi), %ymm12 vmovaps (%rsi), %ymm13 vmovaps (%rsi), %ymm14 sub %r9, %rdi jnz load256_loop pop %r9 pop %r8 pop %rcx pop %rbx ret load512: push %rbx push %rcx push %r8 push %r9 mov $20, %r9 load512_loop: vmovaps (%rsi), %zmm10 vmovaps (%rsi), %zmm11 vmovaps (%rsi), %zmm12 vmovaps (%rsi), %zmm13 vmovaps (%rsi), %zmm14 vmovaps (%rsi), %zmm10 vmovaps (%rsi), %zmm11 vmovaps (%rsi), %zmm12 vmovaps (%rsi), %zmm13 vmovaps (%rsi), %zmm14 vmovaps (%rsi), %zmm10 vmovaps (%rsi), %zmm11 vmovaps (%rsi), %zmm12 vmovaps (%rsi), %zmm13 vmovaps (%rsi), %zmm14 vmovaps (%rsi), %zmm10 vmovaps (%rsi), %zmm11 vmovaps (%rsi), %zmm12 vmovaps (%rsi), %zmm13 vmovaps (%rsi), %zmm14 sub %r9, %rdi jnz load512_loop pop %r9 pop %r8 pop %rcx pop %rbx ret store128: push %rbx push %rcx push %r8 push %r9 movdqa (%rsi), %xmm10 movdqa %xmm10, %xmm11 movdqa %xmm10, %xmm12 movdqa %xmm10, %xmm13 movdqa %xmm10, %xmm14 mov $20, %r9 store128_loop: movdqa %xmm10, (%rdx) movdqa %xmm11, (%rdx) movdqa %xmm12, (%rdx) movdqa %xmm13, (%rdx) movdqa 
%xmm14, (%rdx) movdqa %xmm10, (%rdx) movdqa %xmm11, (%rdx) movdqa %xmm12, (%rdx) movdqa %xmm13, (%rdx) movdqa %xmm14, (%rdx) movdqa %xmm10, (%rdx) movdqa %xmm11, (%rdx) movdqa %xmm12, (%rdx) movdqa %xmm13, (%rdx) movdqa %xmm14, (%rdx) movdqa %xmm10, (%rdx) movdqa %xmm11, (%rdx) movdqa %xmm12, (%rdx) movdqa %xmm13, (%rdx) movdqa %xmm14, (%rdx) sub %r9, %rdi jnz store128_loop pop %r9 pop %r8 pop %rcx pop %rbx ret store256: push %rbx push %rcx push %r8 push %r9 vmovaps (%rsi), %ymm10 vmovaps %ymm10, %ymm11 vmovaps %ymm10, %ymm12 vmovaps %ymm10, %ymm13 vmovaps %ymm10, %ymm14 mov $20, %r9 store256_loop: vmovaps %ymm10, (%rdx) vmovaps %ymm11, (%rdx) vmovaps %ymm12, (%rdx) vmovaps %ymm13, (%rdx) vmovaps %ymm14, (%rdx) vmovaps %ymm10, (%rdx) vmovaps %ymm11, (%rdx) vmovaps %ymm12, (%rdx) vmovaps %ymm13, (%rdx) vmovaps %ymm14, (%rdx) vmovaps %ymm10, (%rdx) vmovaps %ymm11, (%rdx) vmovaps %ymm12, (%rdx) vmovaps %ymm13, (%rdx) vmovaps %ymm14, (%rdx) vmovaps %ymm10, (%rdx) vmovaps %ymm11, (%rdx) vmovaps %ymm12, (%rdx) vmovaps %ymm13, (%rdx) vmovaps %ymm14, (%rdx) sub %r9, %rdi jnz store256_loop pop %r9 pop %r8 pop %rcx pop %rbx ret store512: push %rbx push %rcx push %r8 push %r9 vmovaps (%rsi), %zmm10 vmovaps %zmm10, %zmm11 vmovaps %zmm10, %zmm12 vmovaps %zmm10, %zmm13 vmovaps %zmm10, %zmm14 mov $20, %r9 store512_loop: vmovaps %zmm10, (%rdx) vmovaps %zmm11, (%rdx) vmovaps %zmm12, (%rdx) vmovaps %zmm13, (%rdx) vmovaps %zmm14, (%rdx) vmovaps %zmm10, (%rdx) vmovaps %zmm11, (%rdx) vmovaps %zmm12, (%rdx) vmovaps %zmm13, (%rdx) vmovaps %zmm14, (%rdx) vmovaps %zmm10, (%rdx) vmovaps %zmm11, (%rdx) vmovaps %zmm12, (%rdx) vmovaps %zmm13, (%rdx) vmovaps %zmm14, (%rdx) vmovaps %zmm10, (%rdx) vmovaps %zmm11, (%rdx) vmovaps %zmm12, (%rdx) vmovaps %zmm13, (%rdx) vmovaps %zmm14, (%rdx) sub %r9, %rdi jnz store512_loop pop %r9 pop %r8 pop %rcx pop %rbx ret pdeptest: push %rbx push %rcx push %r8 push %r9 push %r10 push %r11 push %r12 push %r13 push %r14 push %r15 mov $1, %r8 mov $20, %r9 xor %rbx, 
%rbx xor %rcx, %rcx xor %r10, %r10 xor %r11, %r11 xor %r12, %r12 xor %r13, %r13 xor %r14, %r14 xor %r15, %r15 pdeptest_loop: pdep %r8, %r15, %r15 pdep %r8, %r14, %r14 pdep %r8, %r13, %r13 pdep %r8, %r12, %r12 pdep %r8, %r11, %r11 pdep %r8, %r10, %r10 pdep %r8, %rcx, %rcx pdep %r8, %rbx, %rbx pdep %r8, %r15, %r15 pdep %r8, %r14, %r14 pdep %r8, %r13, %r13 pdep %r8, %r12, %r12 pdep %r8, %r11, %r11 pdep %r8, %r10, %r10 pdep %r8, %rcx, %rcx pdep %r8, %rbx, %rbx pdep %r8, %r15, %r15 pdep %r8, %r14, %r14 pdep %r8, %r13, %r13 pdep %r8, %r12, %r12 sub %r9, %rdi jnz pdeptest_loop pop %r15 pop %r14 pop %r13 pop %r12 pop %r11 pop %r10 pop %r9 pop %r8 pop %rcx pop %rbx ret pdepmultest: push %rbx push %rcx push %rsi push %r8 push %r9 push %r10 push %r11 push %r12 push %r13 push %r14 push %r15 mov $1, %r8 mov $20, %r9 xor %rbx, %rbx xor %rcx, %rcx xor %rsi, %rsi xor %r10, %r10 xor %r11, %r11 xor %r12, %r12 xor %r13, %r13 xor %r14, %r14 xor %r15, %r15 pdepmultest_loop: pdep %r8, %r15, %r15 imul %r9, %r14 pdep %r8, %r13, %r13 imul %r9, %r12 pdep %r8, %r11, %r11 imul %r9, %r10 pdep %r8, %rcx, %rcx imul %r9, %rbx pdep %r8, %r15, %r15 imul %r9, %rsi pdep %r8, %r15, %r15 imul %r9, %r14 pdep %r8, %r13, %r13 imul %r9, %r12 pdep %r8, %r11, %r11 imul %r9, %r10 pdep %r8, %rcx, %rcx imul %r9, %rbx pdep %r8, %r15, %r15 imul %r9, %rsi sub %r9, %rdi jnz pdepmultest_loop pop %r15 pop %r14 pop %r13 pop %r12 pop %r11 pop %r10 pop %r9 pop %r8 pop %rsi pop %rcx pop %rbx ret pexttest: push %rbx push %rcx push %r8 push %r9 push %r10 push %r11 push %r12 push %r13 push %r14 push %r15 mov $1, %r8 mov $20, %r9 xor %rbx, %rbx xor %rcx, %rcx xor %r10, %r10 xor %r11, %r11 xor %r12, %r12 xor %r13, %r13 xor %r14, %r14 xor %r15, %r15 pexttest_loop: pext %r8, %r15, %r15 pext %r8, %r14, %r14 pext %r8, %r13, %r13 pext %r8, %r12, %r12 pext %r8, %r11, %r11 pext %r8, %r10, %r10 pext %r8, %rcx, %rcx pext %r8, %rbx, %rbx pext %r8, %r15, %r15 pext %r8, %r14, %r14 pext %r8, %r13, %r13 pext %r8, %r12, %r12 pext %r8, %r11, 
%r11 pext %r8, %r10, %r10 pext %r8, %rcx, %rcx pext %r8, %rbx, %rbx pext %r8, %r15, %r15 pext %r8, %r14, %r14 pext %r8, %r13, %r13 pext %r8, %r12, %r12 sub %r9, %rdi jnz pexttest_loop pop %r15 pop %r14 pop %r13 pop %r12 pop %r11 pop %r10 pop %r9 pop %r8 pop %rcx pop %rbx ret depmovtest: push %rbx push %r8 push %r9 push %r15 push %r14 push %r13 push %r12 push %r11 push %r10 mov $1, %r8 mov $20, %r9 xor %rbx, %rbx depmovtest_loop: mov %r15, %r12 mov %r12, %r14 mov %r14, %r13 mov %r13, %r11 mov %r11, %r15 mov %r15, %r12 mov %r12, %r14 mov %r14, %r13 mov %r13, %r11 mov %r11, %r15 mov %r15, %r12 mov %r12, %r14 mov %r14, %r13 mov %r13, %r11 mov %r11, %r15 mov %r15, %r12 mov %r12, %r14 mov %r14, %r13 mov %r13, %r11 mov %r11, %r15 sub %r9, %rdi jnz depmovtest_loop pop %r10 pop %r11 pop %r12 pop %r13 pop %r14 pop %r15 pop %r9 pop %r8 pop %rbx ret vecdepmovtest: push %rbx push %r8 push %r9 push %r15 push %r14 push %r13 push %r12 push %r11 push %r10 mov $1, %r8 mov $20, %r9 cvtsi2ss %r9, %xmm15 xor %rbx, %rbx vecdepmovtest_loop: movaps %xmm15, %xmm12 movaps %xmm12, %xmm14 movaps %xmm14, %xmm13 movaps %xmm13, %xmm11 movaps %xmm11, %xmm15 movaps %xmm15, %xmm12 movaps %xmm12, %xmm14 movaps %xmm14, %xmm13 movaps %xmm13, %xmm11 movaps %xmm11, %xmm15 movaps %xmm15, %xmm12 movaps %xmm12, %xmm14 movaps %xmm14, %xmm13 movaps %xmm13, %xmm11 movaps %xmm11, %xmm15 movaps %xmm15, %xmm12 movaps %xmm12, %xmm14 movaps %xmm14, %xmm13 movaps %xmm13, %xmm11 movaps %xmm11, %xmm15 sub %r9, %rdi jnz vecdepmovtest_loop pop %r10 pop %r11 pop %r12 pop %r13 pop %r14 pop %r15 pop %r9 pop %r8 pop %rbx ret vecindepmovtest: push %rbx push %rcx push %r8 push %r9 push %r15 push %r14 push %r13 push %r12 push %r11 push %r10 mov $1, %r8 mov $20, %r9 cvtsi2ss %r9, %xmm0 movaps %xmm0, %xmm1 movaps %xmm0, %xmm10 movaps %xmm0, %xmm11 movaps %xmm0, %xmm12 xor %rbx, %rbx vecindepmovtest_loop: movaps %xmm10, %xmm15 movaps %xmm11, %xmm14 movaps %xmm12, %xmm13 movaps %xmm0, %xmm15 movaps %xmm1, %xmm14 movaps %xmm10, 
%xmm15 movaps %xmm11, %xmm14 movaps %xmm12, %xmm13 movaps %xmm0, %xmm15 movaps %xmm1, %xmm14 movaps %xmm10, %xmm15 movaps %xmm11, %xmm14 movaps %xmm12, %xmm13 movaps %xmm0, %xmm15 movaps %xmm1, %xmm14 movaps %xmm10, %xmm15 movaps %xmm11, %xmm14 movaps %xmm12, %xmm13 movaps %xmm0, %xmm15 movaps %xmm1, %xmm14 sub %r9, %rdi jnz vecindepmovtest_loop pop %r10 pop %r11 pop %r12 pop %r13 pop %r14 pop %r15 pop %r9 pop %r8 pop %rcx pop %rbx ret indepmovtest: push %rbx push %rcx push %r8 push %r9 push %r15 push %r14 push %r13 push %r12 push %r11 push %r10 mov $1, %r8 mov $20, %r9 xor %rbx, %rbx indepmovtest_loop: mov %r10, %r15 mov %r11, %r14 mov %r12, %r13 mov %rax, %r15 mov %rcx, %r14 mov %r10, %r15 mov %r11, %r14 mov %r12, %r13 mov %rax, %r15 mov %rcx, %r14 mov %r10, %r15 mov %r11, %r14 mov %r12, %r13 mov %rax, %r15 mov %rcx, %r14 mov %r10, %r15 mov %r11, %r14 mov %r12, %r13 mov %rax, %r15 mov %rcx, %r14 sub %r9, %rdi jnz indepmovtest_loop pop %r10 pop %r11 pop %r12 pop %r13 pop %r14 pop %r15 pop %r9 pop %r8 pop %rcx pop %rbx ret movzerotest: push %rbx push %rcx push %r8 push %r9 push %r15 push %r14 push %r13 push %r12 push %r11 push %r10 mov $1, %r8 mov $20, %r9 xor %rbx, %rbx movzerotest_loop: mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 mov $0, %r15 sub %r9, %rdi jnz movzerotest_loop pop %r10 pop %r11 pop %r12 pop %r13 pop %r14 pop %r15 pop %r9 pop %r8 pop %rcx pop %rbx ret xorzerotest: push %rbx push %rcx push %r8 push %r9 push %r15 push %r14 push %r13 push %r12 push %r11 push %r10 mov $1, %r8 mov $20, %r9 xor %rbx, %rbx xorzerotest_loop: xor %r15, %r15 xor %r15, %r15 xor %r15, %r15 xor %r15, %r15 xor %r15, %r15 xor %r15, %r15 xor %r15, %r15 xor %r15, %r15 xor %r15, %r15 xor %r15, %r15 xor %r15, %r15 xor %r15, %r15 xor %r15, %r15 xor %r15, %r15 xor %r15, %r15 xor %r15, 
/*
 * vecsubzerotest: scalar-float "subtract a register from itself" kernel.
 *
 * %rdi holds the remaining instruction count. Each loop pass issues 20
 * `subss %xmm15, %xmm15` instructions and subtracts 20 (%r9) from %rdi; the
 * loop exits only when %rdi reaches exactly zero, so the count is expected
 * to be a multiple of 20. %xmm15 is seeded with 20.0 before the loop.
 * %rax is not written, so no meaningful value is returned.
 *
 * Fix: the loop branch used to target subzerotest_loop (the integer
 * `sub %r15, %r15` kernel). After the first pass execution therefore
 * continued in the integer routine and the subss instructions were never
 * timed again. The branch now targets this routine's own loop label.
 */
vecsubzerotest:
  /* save every register this routine may touch */
  push %rbx
  push %rcx
  push %r8
  push %r9
  push %r15
  push %r14
  push %r13
  push %r12
  push %r11
  push %r10
  mov $1, %r8
  mov $20, %r9            /* instructions retired per loop pass */
  cvtsi2ss %r9, %xmm15    /* give xmm15 a defined starting value (20.0) */
  xor %rbx, %rbx
vecsubzerotest_loop:
  .rept 20
  subss %xmm15, %xmm15
  .endr
  sub %r9, %rdi
  jnz vecsubzerotest_loop /* was: jnz subzerotest_loop */
  /* restore in reverse push order */
  pop %r10
  pop %r11
  pop %r12
  pop %r13
  pop %r14
  pop %r15
  pop %r9
  pop %r8
  pop %rcx
  pop %rbx
  ret
/*
 * fma4_256: 256-bit FMA4 (vfmaddps) kernel.
 *
 * %rdi holds the remaining instruction count. Each loop pass issues 20
 * vfmaddps instructions and subtracts 20 (%r9) from %rdi; the loop exits
 * only when %rdi reaches exactly zero, so the count is expected to be a
 * multiple of 20.
 *
 * %ymm6 is filled with 20.0 in every lane and copied into the ten
 * accumulators %ymm5 and %ymm7-%ymm15. Every vfmaddps uses %ymm6 for both
 * multiplicands and one accumulator as addend and destination, so each
 * accumulator is written twice per pass.
 *
 * Returns 20 in %rax (stashed in %xmm1 on entry, which the loop never
 * touches).
 */
fma4_256:
  push %r9
  push %r8
  mov $20, %r9                    /* instructions retired per loop pass */
  movq %r9, %xmm1                 /* keep 20 around for the return value */
  cvtsi2ss %r9, %xmm6
  movups %xmm6, -32(%rsp)         /* spill below rsp so it can be broadcast from memory */
  vbroadcastss -32(%rsp), %ymm6
  /* seed the ten accumulators from ymm6 */
  .irp acc,5,7,8,9,10,11,12,13,14,15
  vmovups %ymm6, %ymm\acc
  .endr
fma4_256_loop:
  /* two sweeps over the ten accumulators = 20 FMAs per pass */
  .rept 2
  .irp acc,5,7,8,9,10,11,12,13,14,15
  vfmaddps %ymm6, %ymm6, %ymm\acc, %ymm\acc
  .endr
  .endr
  sub %r9, %rdi
  jnz fma4_256_loop
  movq %xmm1, %rax
  vzeroupper
  pop %r8
  pop %r9
  ret
/*
 * fdivlattest: dependent scalar-float divide chain.
 *
 * %rdi holds the remaining instruction count. Each loop pass issues 20
 * `divss %xmm6, %xmm6` instructions, each consuming the previous result,
 * and subtracts 20 (%r9) from %rdi. The loop exits only when %rdi reaches
 * exactly zero, so the count is expected to be a multiple of 20.
 * %xmm6 starts at 20.0.
 *
 * %rax is loaded from %xmm1, which this routine never writes, so the return
 * value is whatever the caller left in %xmm1.
 *
 * Fix: the loop branch used to target fdivtest_loop, so after the first
 * pass the independent-divide (throughput) loop of fdivtest ran instead of
 * this dependent chain. It now targets fdivlattest_loop.
 */
fdivlattest:
  push %r9
  push %r8
  mov $20, %r9            /* instructions retired per loop pass */
  cvtsi2ss %r9, %xmm6
fdivlattest_loop:
  .rept 20
  divss %xmm6, %xmm6
  .endr
  sub %r9, %rdi
  jnz fdivlattest_loop    /* was: jnz fdivtest_loop */
  movq %xmm1, %rax
  vzeroupper
  pop %r8
  pop %r9
  ret

/*
 * fmuldenormlattest: dependent scalar-float multiply chain seeded with the
 * smallest normal single-precision value.
 *
 * %xmm6 = 0x00800000 (smallest normal), %xmm7 = 0.5, %xmm4 = 2.0.
 * Each loop pass multiplies %xmm6 by 0.5 twelve times and then by 2.0 eight
 * times (20 dependent mulss), and subtracts 20 (%r9) from %rdi. The loop
 * exits only when %rdi reaches exactly zero.
 *
 * %rax is loaded from %xmm1, which this routine never writes.
 *
 * Fix: the halving multiplies used %xmm5, which is never initialised in
 * this routine, while the 0.5 constant loaded into %xmm7 was never used.
 * They now use %xmm7.
 *
 * NOTE(review): 12 halvings against 8 doublings shrink the value by 2^-4
 * per pass, so the chain eventually underflows; confirm whether a balanced
 * count was intended.
 */
fmuldenormlattest:
  push %r9
  push %r8
  mov $0x00800000, %r9    /* smallest normal */
  mov $0x3f000000, %r8    /* 0.5 */
  movq %r9, %xmm6
  movq %r8, %xmm7
  mov $0x40000000, %r8    /* 2 */
  movq %r8, %xmm4
  mov $20, %r9            /* instructions retired per loop pass */
fmuldenormlattest_loop:
  .rept 12
  mulss %xmm7, %xmm6      /* was: mulss %xmm5, %xmm6 (xmm5 uninitialised) */
  .endr
  .rept 8
  mulss %xmm4, %xmm6
  .endr
  sub %r9, %rdi
  jnz fmuldenormlattest_loop
  movq %xmm1, %rax
  vzeroupper
  pop %r8
  pop %r9
  ret
%xmm1, %rax pop %r10 pop %r8 pop %r9 ret ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: LoadedMemoryLatency/LoadedMemoryLatency/LoadedMemoryLatency.asm ================================================ section .text bits 64 global asm_read ; rcx = ptr to array ; rdx = array length in bytes ; r8 = stop flag ; r9 = throttle factor ; return bytes read in rax asm_read: push rdi push rsi push r10 push r11 mov rdi, rcx ; save array base address xor rsi, rsi ; index xor rax, rax ; return value asm_read_pass_loop: movups xmm0, [rdi] movups xmm0, [rdi + 16] movups xmm0, [rdi + 32] movups xmm0, [rdi + 48] movups xmm0, [rdi + 64] movups xmm0, [rdi + 80] movups xmm0, [rdi + 96] movups xmm0, [rdi + 112] add rdi, 128 add rsi, 128 ; update index add rax, 128 ; update return value test r9, r9 ; need to throttle? 
jz asm_read_throttle_end mov r10, r9 asm_read_throttle: dec r10 jnz asm_read_throttle; asm_read_throttle_end: mov r10d, [r8] ; check stop flag test r10d, r10d jnz asm_read_end cmp rdx, rsi ; array len - index > 0? jg asm_read_pass_loop mov rdi, rcx ; reset to start xor rsi, rsi ; and reset index jmp asm_read_pass_loop asm_read_end: pop r11 pop r10 pop rsi pop rdi ret ================================================ FILE: LoadedMemoryLatency/LoadedMemoryLatency/LoadedMemoryLatency.cpp ================================================ #include #include #include #include #include #include #include #include #define CACHELINE_SIZE 64 struct BandwidthTestThreadData { uint64_t read_bytes; uint64_t arr_length_bytes; char* arr; volatile int* flag; HANDLE threadHandle; }; struct LatencyTestData { uint32_t iterations; uint32_t* arr; float latency; HANDLE threadHandle; }; extern "C" uint64_t asm_read(char* arr, uint64_t arr_length, volatile int* flag, int waitfactor); DWORD ReadBandwidthTestThread(void* param); DWORD FillBandwidthTestArr(void* param); void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment); DWORD RunLatencyTest(void* param); bool GetPrivilege(); float RunTest(uint64_t latencyAffinityMask, uint64_t bwAffinityMask, int bwThreadCount, int hugepages, float* measuredBw); void StartMonitoring(); void EndMonitoring(); void SetupMonitoring(); void CloseMonitoring(); uint64_t BandwidthTestMemoryKB = 1048576 * 4; uint64_t LatencyTestMemoryKB = 1048576; uint64_t LatencyTestIterations = 1e5; uint64_t throttle = 0; int main(int argc, char* argv[]) { SYSTEM_INFO sysInfo; GetSystemInfo(&sysInfo); int bwThreadCap = sysInfo.dwNumberOfProcessors - 1; int coreCount = sysInfo.dwNumberOfProcessors; int latencyCore = 0; int* customCores = NULL; if (argc == 1) { fprintf(stderr, "Options:\n"); fprintf(stderr, "-bwthreads [int]: Number of bandwidth test threads\n"); fprintf(stderr, "-latencyaffinity [int]: Core to run latency test thread on\n"); 
fprintf(stderr, "-bwcores [comma separated list]: Cores to run bandwidth load on\n"); fprintf(stderr, "-scaleiterations [int]: Iterations scaling factor\n"); fprintf(stderr, "-throttle [int]: Reduce bandwidth load per bandwidth test thread\n"); } for (int argIdx = 1; argIdx < argc; argIdx++) { if (*(argv[argIdx]) == '-') { char* arg = argv[argIdx] + 1; if (strncmp(arg, "bwthreads", 9) == 0) { argIdx++; bwThreadCap = atoi(argv[argIdx]); fprintf(stderr, "Using up to %d bw threads\n", bwThreadCap); } else if (strncmp(arg, "latencyaffinity", 15) == 0) { argIdx++; latencyCore = atoi(argv[argIdx]); fprintf(stderr, "Latency test thread will run in core %d\n", latencyCore); } else if (strncmp(arg, "scaleiterations", 15) == 0) { argIdx++; int scaleFactor = atoi(argv[argIdx]); LatencyTestIterations *= scaleFactor; fprintf(stderr, "Scaling iterations up by a factor of %d\n", scaleFactor); } else if (strncmp(arg, "throttle", 8) == 0) { argIdx++; throttle = atoi(argv[argIdx]); fprintf(stderr, "Pulling memory bandwidth test threads back, factor of %lld\n", throttle); } else if (strncmp(arg, "bwcores", 7) == 0) { argIdx++; char* customCoreListStr = argv[argIdx]; bwThreadCap = 1; for (int i = 0; customCoreListStr[i] != 0; i++) { // shell should null terminate this if (customCoreListStr[i] == ',') { bwThreadCap++; } } customCores = (int*)malloc(sizeof(int) * bwThreadCap); memset(customCores, 0, sizeof(int) * bwThreadCap); int commaIdx = 1; for (int i = 0; customCoreListStr[i] != 0; i++) { if (customCoreListStr[i] == ',') { customCores[commaIdx] = i + 1; commaIdx++; customCoreListStr[i] = '\0'; } } fprintf(stderr, "Cores used for bandwidth load:"); for (int i = 0; i < bwThreadCap; i++) { customCores[i] = atoi(customCoreListStr + customCores[i]); fprintf(stderr, " %d", customCores[i]); } fprintf(stderr, "\n"); } } } GetPrivilege(); //SetupMonitoring(); uint64_t latencyAffinityMask = 1UL << latencyCore; uint64_t bwAffinityMask = 0; fprintf(stderr, "%d cores, will use up to %d for BW 
threads\n", coreCount, bwThreadCap); float* latencies = (float*)malloc(sizeof(float) * bwThreadCap + 1); float* bandwidths = (float*)malloc(sizeof(float) * bwThreadCap + 1); for (int bwThreadCount = 0; bwThreadCount <= bwThreadCap; bwThreadCount++) { float bw; int nextCore; if (bwThreadCount > 0) { if (customCores == NULL) nextCore = coreCount - bwThreadCount - 1; else nextCore = customCores[bwThreadCount - 1]; fprintf(stderr, "next core is %d\n", nextCore); bwAffinityMask |= 1UL << nextCore; } float latencyNs = RunTest(latencyAffinityMask, bwAffinityMask, bwThreadCount, 1, &bw); fprintf(stderr, "%d bw threads %f GB/s %f ns\n", bwThreadCount, bw, latencyNs); latencies[bwThreadCount] = latencyNs; bandwidths[bwThreadCount] = bw; } printf("BW Threads, Bandwidth (GB/s), Latency (ns)\n"); for (int bwThreadCount = 0; bwThreadCount <= bwThreadCap; bwThreadCount++) { printf("%d, %f, %f\n", bwThreadCount, bandwidths[bwThreadCount], latencies[bwThreadCount]); } free(latencies); free(bandwidths); if (customCores != NULL) free(customCores); //CloseMonitoring(); return 0; } // returns latency in ns // sets measuredBw = measured bandwidth float RunTest(uint64_t latencyAffinity, uint64_t bwAffinity, int bwThreadCount, int hugepages, float* measuredBw) { uint64_t perThreadArrSizeBytes = ceil((double)BandwidthTestMemoryKB / (double)bwThreadCount) * 1024; volatile int flag = 0; // set 1 to stop struct timeb start, end; int map_failed = 0; // MT bw test array fill struct BandwidthTestThreadData* bandwidthTestData = (struct BandwidthTestThreadData*)malloc(sizeof(struct BandwidthTestThreadData) * bwThreadCount); HANDLE* threadHandles = (HANDLE*)malloc(sizeof(HANDLE) * bwThreadCount); for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) { bandwidthTestData[threadIdx].read_bytes = 0; bandwidthTestData[threadIdx].flag = &flag; bandwidthTestData[threadIdx].arr = (char*)malloc(perThreadArrSizeBytes); bandwidthTestData[threadIdx].arr_length_bytes = perThreadArrSizeBytes; 
threadHandles[threadIdx] = CreateThread(NULL, 0, FillBandwidthTestArr, bandwidthTestData + threadIdx, 0, NULL); } // set up latency test uint32_t* latencyArr; latencyArr = (uint32_t *)VirtualAlloc(NULL, LatencyTestMemoryKB * 1024, MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES, PAGE_READWRITE); if (latencyArr == NULL) { // MAP_FAILED fprintf(stderr, "Failed to get memory via VirtualAlloc. Using plain malloc\n"); latencyArr = (uint32_t *)malloc(LatencyTestMemoryKB * 1024); map_failed = 1; } struct LatencyTestData latencyTestData; latencyTestData.iterations = LatencyTestIterations; latencyTestData.latency = 0.0f; latencyTestData.arr = latencyArr; FillPatternArr(latencyArr, (LatencyTestMemoryKB * 256), CACHELINE_SIZE); WaitForMultipleObjects(bwThreadCount, threadHandles, true, INFINITE); for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) threadHandles[threadIdx] = INVALID_HANDLE_VALUE; // create bw test threads for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) { threadHandles[threadIdx] = CreateThread(NULL, 0, ReadBandwidthTestThread, bandwidthTestData + threadIdx, CREATE_SUSPENDED, NULL); SetThreadAffinityMask(threadHandles[threadIdx], bwAffinity); } //StartMonitoring(); ftime(&start); // start bw test threads for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) { ResumeThread(threadHandles[threadIdx]); } HANDLE latencyThreadHandle = CreateThread(NULL, 0, RunLatencyTest, (void*)&latencyTestData, CREATE_SUSPENDED, NULL); SetThreadAffinityMask(latencyThreadHandle, latencyAffinity); ResumeThread(latencyThreadHandle); WaitForSingleObject(latencyThreadHandle, INFINITE); flag = 1; WaitForMultipleObjects(bwThreadCount, threadHandles, true, INFINITE); ftime(&end); //EndMonitoring(); // count on a cacheline basis even though the test only loads 4B at a time uint64_t latencyReadBytes = 64 * LatencyTestIterations; uint64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm); float totalReadData = 
// Returns 32 pseudo-random bits assembled from three rand() calls.
// rand() is only guaranteed to produce 15 bits (RAND_MAX >= 32767, and it is
// exactly 32767 with the Microsoft C runtime), which is far too few to index
// arrays with millions of entries.
static uint32_t FillPatternRand32() {
    uint32_t hi = (uint32_t)(rand() & 0x7fff);
    uint32_t mid = (uint32_t)(rand() & 0x7fff);
    uint32_t lo = (uint32_t)(rand() & 0x7fff);
    return (hi << 30) | (mid << 15) | lo;
}

// Fills pattern_arr with a pointer-chasing pattern.
//
// Only every `byte_increment / sizeof(uint32_t)`-th element is written; each
// written element holds the index (in uint32_t units) of another written
// element. The indices are arranged with Sattolo's algorithm, so following
// arr[arr[...arr[0]]] visits every written element exactly once before
// returning to element 0.
//
//   pattern_arr     destination array, at least list_size elements
//   list_size       array length in uint32_t elements
//   byte_increment  spacing between written elements, in bytes
//
// Does nothing if pattern_arr is NULL or byte_increment is smaller than one
// uint32_t (the previous code divided by zero in that case).
//
// Fix: swap partners were picked with `rand() % (iter - 1)`. With a 15-bit
// rand() only the first 32767 slots could ever be chosen as partners, no
// matter how large the array was. Partners are now drawn from the full
// [0, iter) range using 32 random bits. Loop indices are unsigned to match
// element_count.
void FillPatternArr(uint32_t* pattern_arr, uint32_t list_size, uint32_t byte_increment) {
    uint32_t increment = byte_increment / sizeof(uint32_t);
    if (pattern_arr == NULL || increment == 0) {
        return;
    }

    uint32_t element_count = list_size / increment;

    // Start with every slot pointing at itself.
    for (uint32_t i = 0; i < element_count; i++) {
        pattern_arr[i * increment] = i * increment;
    }

    // Sattolo shuffle: swapping slot `iter` with a strictly lower slot at
    // every step yields a permutation that is a single cycle.
    for (uint32_t iter = element_count; iter > 1; ) {
        iter -= 1;
        uint32_t j = FillPatternRand32() % iter;
        uint32_t tmp = pattern_arr[iter * increment];
        pattern_arr[iter * increment] = pattern_arr[j * increment];
        pattern_arr[j * increment] = tmp;
    }
}
WinRing0 driver: %d\n", GetLastError()); } } uint64_t ReadMsr(uint32_t index) { uint32_t code = (WINRING0_DEVICE_TYPE << 16) | (RDMSR_FUNCTION << 2); uint64_t rc; DWORD bytesReturned; if (!DeviceIoControl(driverHandle, code, &index, sizeof(uint32_t), &rc, sizeof(uint64_t), &bytesReturned, NULL)) { fprintf(stderr, "ReadMsr failed (ioctl returned false)\n"); } return rc; } void WriteMsr(uint32_t index, uint64_t value) { uint32_t code = (WINRING0_DEVICE_TYPE << 16) | (WRMSR_FUNCTION << 2); char inputBuffer[sizeof(uint32_t) + sizeof(uint64_t)]; *(uint32_t*)inputBuffer = index; *(uint64_t*)(inputBuffer + sizeof(uint32_t)) = value; DWORD bytesReturned; if (!DeviceIoControl(driverHandle, code, &inputBuffer, sizeof(uint32_t) + sizeof(uint64_t), NULL, 0, &bytesReturned, NULL)) { fprintf(stderr, "WriteMsr failed (ioctl returned false)\n"); } } #define L3_PERF_CTL0 0xC0010230 #define L3_PERF_CTL1 0xC0010232 #define L3_PERF_CTL2 0xC0010234 #define L3_PERF_CTL3 0xC0010236 #define L3_PERF_CTR0 0xC0010231 #define L3_PERF_CTR1 0xC0010233 #define L3_PERF_CTR2 0xC0010235 #define L3_PERF_CTR3 0xC0010237 void ClearL3Counters() { WriteMsr(L3_PERF_CTR0, 0); WriteMsr(L3_PERF_CTR1, 0); WriteMsr(L3_PERF_CTR2, 0); WriteMsr(L3_PERF_CTR3, 0); } void StartMonitoring() { uint64_t l3access = 0x0300c0000040ff04; uint64_t l3miss = 0x0300c00000400104; uint64_t l3miss_sampled_dram_req = 0x0303c000004003ad; uint64_t l3miss_sampled_dram_req_latency = 0x0303c000004003ac; SetThreadAffinityMask(GetCurrentThread(), 1); // use core 0 in ccd 0 WriteMsr(L3_PERF_CTL0, l3access); WriteMsr(L3_PERF_CTL1, l3miss); WriteMsr(L3_PERF_CTL2, l3miss_sampled_dram_req); WriteMsr(L3_PERF_CTL3, l3miss_sampled_dram_req_latency); ClearL3Counters(); SetThreadAffinityMask(GetCurrentThread(), 16); // use core 0 in ccd 1 WriteMsr(L3_PERF_CTL0, l3access); WriteMsr(L3_PERF_CTL1, l3miss); WriteMsr(L3_PERF_CTL2, l3miss_sampled_dram_req); WriteMsr(L3_PERF_CTL3, l3miss_sampled_dram_req_latency); ClearL3Counters(); } void 
// Enables SeLockMemoryPrivilege on the current process token.
// Returns true when the privilege was enabled and the token handle was closed
// cleanly. Returns false after printing the failing step to stderr otherwise.
// The token handle is closed on every path once it has been opened.
bool GetPrivilege() {
    HANDLE hToken;
    TOKEN_PRIVILEGES tp;
    BOOL status;
    DWORD error;

    // open process token
    if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &hToken)) {
        fprintf(stderr, "OpenProcessToken failed: %lu\n", (unsigned long)GetLastError());
        return false;
    }

    // get the luid
    if (!LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid)) {
        // Read the error before CloseHandle can overwrite it.
        error = GetLastError();
        fprintf(stderr, "Could not get luid: %lu\n", (unsigned long)error);
        CloseHandle(hToken);
        return false;
    }

    // enable privilege
    tp.PrivilegeCount = 1;
    tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
    status = AdjustTokenPrivileges(hToken, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0);

    // It is possible for AdjustTokenPrivileges to return TRUE and still not succeed.
    // So always check for the last error value.
    error = GetLastError();
    if (!status || (error != ERROR_SUCCESS)) {
        fprintf(stderr, "AdjustTokenPrivileges failed with status %d, error %lu\n", status, (unsigned long)error);
        CloseHandle(hToken);
        return false;
    }

    // close the handle
    if (!CloseHandle(hToken)) {
        fprintf(stderr, "CloseHandle failed: %lu\n", (unsigned long)GetLastError());
        return false;
    }

    fprintf(stderr, "Got SeLockMemoryPrivilege\n");
    return true;
}
// Per-thread state for one bandwidth load thread.
struct BandwidthTestThreadData {
    uint64_t read_bytes;         // byte count returned by asm_read/asm_add, stored by ReadBandwidthTestThread
    uint64_t arr_length_bytes;   // size of arr in bytes, passed to the asm kernels
    char *arr;                   // buffer the thread fills and then streams over
    volatile int *flag;          // stop flag; the test driver sets the pointed-to int to 1 to end the run
    cpu_set_t cpuset;            // affinity mask applied with sched_setaffinity when the load thread starts
    pthread_t handle;            // thread handle written by pthread_create and used for pthread_join
    enum TestMethod test_method; // Read selects asm_read, Add selects asm_add
};
pthread_t handle; }; int default_test_sizes[] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 512, 600, 768, 1024, 1536, 2048, 2304, 2560, 3072, 4096, 5120, 6144, 8192, 10240, 12288, 13312, 14336, 15360, 16384, 18432, 20480, 24567, 32768, 65536, 98304, 131072, 262144, 393216, 524288, 1048576 }; extern uint64_t asm_read(char *arr, uint64_t arr_length, volatile int *flag, int waitfactor) __attribute__((ms_abi)); extern uint64_t asm_add(char *arr, uint64_t arr_length, volatile int *flag, int waitfactor) __attribute__((ms_abi)); void *ReadBandwidthTestThread(void *param); void *FillBandwidthTestArr(void *param); void FillPatternArr(uint32_t *pattern_arr, uint32_t list_size, uint32_t byte_increment); void *RunLatencyTest(void *param); float RunTest(cpu_set_t latencyAffinity, cpu_set_t bwAffinity, int bwThreadCount, int hugepages, int sharedLatency, float *measuredBw); float RunBandwidthOnlyTest(cpu_set_t bwAffinity, int bwThreadCount, int sizeKb); uint64_t BandwidthTestMemoryKB = 1048576; uint64_t LatencyTestMemoryKB = 1048576; uint64_t LatencyTestIterations = 1e5; uint64_t throttle = 0; enum TestMethod testMethod = Read; int main(int argc, char *argv[]) { int bwThreadCap = get_nprocs() - 1; int coreCount = get_nprocs(); int latencyCore = 0; int *customCores = NULL; int sharedLatency = 0, bwonly = 0; if (argc == 1) { fprintf(stderr, "Options:\n"); fprintf(stderr, "-bwthreads [int]: Number of bandwidth test threads\n"); fprintf(stderr, "-latencyaffinity [int]: Core to run latency test thread on\n"); fprintf(stderr, "-bwcores [comma separated list]: Cores to run bandwidth load on\n"); fprintf(stderr, "-scaleiterations [int]: Iterations scaling factor\n"); fprintf(stderr, "-throttle [int]: Reduce bandwidth load per bandwidth test thread\n"); } for (int argIdx = 1; argIdx < argc; argIdx++) { if (*(argv[argIdx]) == '-') { char *arg = argv[argIdx] + 1; if (strncmp(arg, "bwthreads", 9) == 0) { argIdx++; bwThreadCap = atoi(argv[argIdx]); fprintf(stderr, "Using up 
to %d bw threads\n", bwThreadCap); } else if (strncmp(arg, "latencyaffinity", 15) == 0) { argIdx++; latencyCore = atoi(argv[argIdx]); fprintf(stderr, "Latency test thread will run in core %d\n", latencyCore); } else if (strncmp(arg, "scaleiterations", 15) == 0) { argIdx++; int scaleFactor = atoi(argv[argIdx]); LatencyTestIterations *= scaleFactor; fprintf(stderr, "Scaling iterations up by a factor of %d\n", scaleFactor); } else if (strncmp(arg, "throttle", 8) == 0) { argIdx++; throttle = atol(argv[argIdx]); fprintf(stderr, "Pulling memory bandwidth test threads back, factor of %lu\n", throttle); } else if (strncmp(arg, "bwcores", 7) == 0) { argIdx++; char *customCoreListStr = argv[argIdx]; bwThreadCap = 1; for (int i = 0; customCoreListStr[i] != 0; i++) { // shell should null terminate this if (customCoreListStr[i] == ',') { bwThreadCap++; } } customCores = (int *)malloc(sizeof(int) * bwThreadCap); memset(customCores, 0, sizeof(int) * bwThreadCap); int commaIdx = 1; for (int i = 0; customCoreListStr[i] != 0; i++) { if (customCoreListStr[i] == ',') { customCores[commaIdx] = i + 1; commaIdx++; customCoreListStr[i] = '\0'; } } fprintf(stderr, "Cores used for bandwidth load:"); for (int i = 0; i < bwThreadCap; i++) { customCores[i] = atoi(customCoreListStr + customCores[i]); fprintf(stderr, " %d", customCores[i]); } fprintf(stderr, "\n"); } else if (strncmp(arg, "sharedlatency", 13) == 0) { fprintf(stderr, "Shared arr bw+latency\n"); sharedLatency = 1; } else if (strncmp(arg, "bwonly", 6) == 0) { fprintf(stderr, "Only testing bandwidth\n"); bwonly = 1; } else if (strncmp(arg, "method", 6) == 0) { argIdx++; if (strncmp(argv[argIdx], "read", 4) == 0) { testMethod = Read; fprintf(stderr, "Testing with reads\n"); } else if (strncmp(argv[argIdx], "add", 3) == 0) { testMethod = Add; fprintf(stderr, "Testing with adds (RMW)\n"); } } } } cpu_set_t latency_cpuset; CPU_ZERO(&latency_cpuset); CPU_SET(latencyCore, &latency_cpuset); cpu_set_t bw_cpuset; CPU_ZERO(&bw_cpuset); if 
(bwonly) { fprintf(stderr, "Only testing bandwidth to abuse the iteration logic\n"); int testSizeCount = sizeof(default_test_sizes) / sizeof(int); float *bandwidths = (float *)malloc(sizeof(float) * testSizeCount); memset(bandwidths, 0, sizeof(float) * testSizeCount); // set the entire affinity mask right away for (int bwThreadCount = 0; bwThreadCount <= bwThreadCap; bwThreadCount++) { int nextCore; if (bwThreadCount > 0) { if (customCores == NULL) nextCore = coreCount - bwThreadCount - 1; else nextCore = customCores[bwThreadCount - 1] ; fprintf(stderr, "next core is %d\n", nextCore); CPU_SET(nextCore, &bw_cpuset); } } for (int testSizeIdx = 0; testSizeIdx < testSizeCount; testSizeIdx++) { int testSizeKb = default_test_sizes[testSizeIdx]; if (testSizeKb < bwThreadCap) { fprintf(stderr, "Skipping size %d because it's too small for specified thread count of %d\n", testSizeKb, bwThreadCap); continue; } float bandwidth = RunBandwidthOnlyTest(bw_cpuset, bwThreadCap, testSizeKb); bandwidths[testSizeIdx] = bandwidth; fprintf(stderr, "Test Size %d KB: %f GB/s\n", default_test_sizes[testSizeIdx], bandwidths[testSizeIdx]); } for (int testSizeIdx = 0; testSizeIdx < testSizeCount; testSizeIdx++) { if (bandwidths[testSizeIdx] == 0.0f) continue; printf("%d,%f\n", default_test_sizes[testSizeIdx], bandwidths[testSizeIdx]); } } else if (!sharedLatency) { fprintf(stderr, "%d cores, will use up to %d for BW threads\n", coreCount, bwThreadCap); float *latencies = (float *)malloc(sizeof(float) * bwThreadCap + 1); float *bandwidths = (float *)malloc(sizeof(float) * bwThreadCap + 1); for (int bwThreadCount = 0; bwThreadCount <= bwThreadCap; bwThreadCount++) { float bw; int nextCore; if (bwThreadCount > 0) { if (customCores == NULL) nextCore = coreCount - bwThreadCount - 1; else nextCore = customCores[bwThreadCount - 1] ; fprintf(stderr, "next core is %d\n", nextCore); CPU_SET(nextCore, &bw_cpuset); } if (nextCore < 0) break; // sharedlatency will always be false in this run mode float 
latencyNs = RunTest(latency_cpuset, bw_cpuset, bwThreadCount, 1, sharedLatency, &bw); fprintf(stderr, "%d bw threads %f GB/s %f ns\n", bwThreadCount, bw, latencyNs); latencies[bwThreadCount] = latencyNs; bandwidths[bwThreadCount] = bw; } printf("BW Threads, Bandwidth (GB/s), Latency (ns)\n"); for (int bwThreadCount = 0; bwThreadCount <= bwThreadCap; bwThreadCount++) { printf("%d, %f, %f\n", bwThreadCount, bandwidths[bwThreadCount], latencies[bwThreadCount]); } free(latencies); free(bandwidths); } else { int testSizeCount = sizeof(default_test_sizes) / sizeof(int); float *latencies = (float*)malloc(sizeof(float) * testSizeCount); float *bandwidths = (float*)malloc(sizeof(float) * testSizeCount); // set mask to all selected cores for (int bwThreadCount = 0; bwThreadCount < bwThreadCap; bwThreadCount++) { int nextCore; if (customCores == NULL) nextCore = coreCount - bwThreadCount - 1; else nextCore = customCores[bwThreadCount]; CPU_SET(nextCore, &bw_cpuset); fprintf(stderr, "Set core %d\n", nextCore); } for (int i = 0; i < testSizeCount; i++) { LatencyTestMemoryKB = default_test_sizes[i]; latencies[i] = RunTest(latency_cpuset, bw_cpuset, bwThreadCap, 1, sharedLatency, bandwidths + i); fprintf(stderr, "%lu KB: %f ns %f GB/s\n", LatencyTestMemoryKB, latencies[i], bandwidths[i]); } printf("Test Size (KB), Latency (ns), Bandwidth (GB/s)\n"); for (int i = 0; i < testSizeCount; i++) { printf("%d,%f,%f\n", default_test_sizes[i], latencies[i], bandwidths[i]); } free(latencies); free(bandwidths); } if (customCores != NULL) free(customCores); return 0; } // Caller ensures at least 1 KB per thread. 
Runs in private mode float RunBandwidthOnlyTest(cpu_set_t bwAffinity, int bwThreadCount, int sizeKb) { volatile int flag = 0; struct timeval startTv, endTv; struct timezone startTz, endTz; struct BandwidthTestThreadData *bandwidthTestData = (struct BandwidthTestThreadData *)malloc(sizeof(struct BandwidthTestThreadData) * bwThreadCount); uint64_t perThreadArrSizeBytes = ceil((double)sizeKb / (double)bwThreadCount) * 1024; // Same initialization routine for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) { bandwidthTestData[threadIdx].read_bytes = 0; bandwidthTestData[threadIdx].test_method = testMethod; bandwidthTestData[threadIdx].flag = &flag; bandwidthTestData[threadIdx].cpuset = bwAffinity; bandwidthTestData[threadIdx].arr = (char *)malloc(perThreadArrSizeBytes); bandwidthTestData[threadIdx].arr_length_bytes = perThreadArrSizeBytes; pthread_create(&(bandwidthTestData[threadIdx].handle), NULL, FillBandwidthTestArr, (void *)(bandwidthTestData + threadIdx)); } for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) { pthread_join(bandwidthTestData[threadIdx].handle, NULL); } // Run bandwidth threads for a few seconds and get results gettimeofday(&startTv, &startTz); for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) { pthread_create(&(bandwidthTestData[threadIdx].handle), NULL, ReadBandwidthTestThread, (void *)(bandwidthTestData + threadIdx)); } sleep(3); flag = 1; for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) { pthread_join(bandwidthTestData[threadIdx].handle, NULL); } gettimeofday(&endTv, &endTz); uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); float totalReadData = 0; for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) { free(bandwidthTestData[threadIdx].arr); totalReadData += (float)bandwidthTestData[threadIdx].read_bytes; } float measuredBw = 1000 * (totalReadData / (float)1e9) / (float)time_diff_ms; free(bandwidthTestData); return 
measuredBw; } // returns latency in ns // sets measuredBw = measured bandwidth float RunTest(cpu_set_t latencyAffinity, cpu_set_t bwAffinity, int bwThreadCount, int hugepages, int sharedLatency, float *measuredBw) { uint64_t perThreadArrSizeBytes = ceil((double)BandwidthTestMemoryKB / (double)bwThreadCount) * 1024; volatile int flag = 0; // set 1 to stop struct timeval startTv, endTv; struct timezone startTz, endTz; int map_failed = 0; // MT bw test array fill struct BandwidthTestThreadData *bandwidthTestData = (struct BandwidthTestThreadData *)malloc(sizeof(struct BandwidthTestThreadData) * bwThreadCount); for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) { bandwidthTestData[threadIdx].read_bytes = 0; bandwidthTestData[threadIdx].test_method = testMethod; bandwidthTestData[threadIdx].flag = &flag; bandwidthTestData[threadIdx].cpuset = bwAffinity; if (!sharedLatency) { bandwidthTestData[threadIdx].arr = (char *)malloc(perThreadArrSizeBytes); bandwidthTestData[threadIdx].arr_length_bytes = perThreadArrSizeBytes; pthread_create(&(bandwidthTestData[threadIdx].handle), NULL, FillBandwidthTestArr, (void *)(bandwidthTestData + threadIdx)); } } // set up latency test uint32_t *latencyArr; latencyArr = mmap(NULL, LatencyTestMemoryKB * 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (latencyArr == (void *)-1) { // MAP_FAILED fprintf(stderr, "Failed to map hugepages arr, will use madvise\n"); if (0 != posix_memalign((void **)(&latencyArr), 64, LatencyTestMemoryKB * 1024)) { fprintf(stderr, "Failed to allocate %lu KB of memory for latency test\n", LatencyTestMemoryKB); return 0.0f; } madvise(latencyArr, LatencyTestMemoryKB * 1024, MADV_HUGEPAGE); map_failed = 1; } struct LatencyTestData latencyTestData; latencyTestData.iterations = LatencyTestIterations; latencyTestData.latency = 0.0f; latencyTestData.cpuset = latencyAffinity; latencyTestData.arr = latencyArr; FillPatternArr(latencyArr, (LatencyTestMemoryKB * 256), 
CACHELINE_SIZE); // let bw array fills finish for (int threadIdx = 0; threadIdx < bwThreadCount && !sharedLatency; threadIdx++) { pthread_join(bandwidthTestData[threadIdx].handle, NULL); } // use one array for all bw test threads. latency test size applies across bw threads if (sharedLatency) { for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) { bandwidthTestData[threadIdx].arr = (char *)latencyArr; bandwidthTestData[threadIdx].arr_length_bytes = LatencyTestMemoryKB * 1024; } } gettimeofday(&startTv, &startTz); // start bw test threads for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) { pthread_create(&(bandwidthTestData[threadIdx].handle), NULL, ReadBandwidthTestThread, (void *)(bandwidthTestData + threadIdx)); } pthread_create(&(latencyTestData.handle), NULL, RunLatencyTest, (void *)&latencyTestData); pthread_join(latencyTestData.handle, NULL); flag = 1; for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) { pthread_join(bandwidthTestData[threadIdx].handle, NULL); } gettimeofday(&endTv, &endTz); // count on a cacheline basis even though the test only loads 4B at a time uint64_t latencyReadBytes = 64 * LatencyTestIterations; uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); float totalReadData = (float)latencyReadBytes; for (int threadIdx = 0; threadIdx < bwThreadCount; threadIdx++) { if (!sharedLatency) free(bandwidthTestData[threadIdx].arr); totalReadData += (float)bandwidthTestData[threadIdx].read_bytes; } *measuredBw = 1000 * (totalReadData / (float)1e9) / (float)time_diff_ms; free(bandwidthTestData); if (map_failed) free(latencyArr); else munmap(latencyArr, LatencyTestMemoryKB * 1024); return latencyTestData.latency; } void FillPatternArr(uint32_t *pattern_arr, uint32_t list_size, uint32_t byte_increment) { uint32_t increment = byte_increment / sizeof(uint32_t); uint32_t element_count = list_size / increment; for (int i = 0; i < element_count; i++) { 
pattern_arr[i * increment] = i * increment; } int iter = element_count; while (iter > 1) { iter -= 1; int j = iter - 1 == 0 ? 0 : rand() % (iter - 1); uint32_t tmp = pattern_arr[iter * increment]; pattern_arr[iter * increment] = pattern_arr[j * increment]; pattern_arr[j * increment] = tmp; } } // No need for simple addressing because this test should be operating well in DRAM // where an extra cycle for indexed addressing should not make a big difference // returns load to use latency in nanoseconds // size_kb should be divisible by 2M, or whatever the hugepage size is void *RunLatencyTest(void *param) { struct timeval startTv, endTv; struct timezone startTz, endTz; struct LatencyTestData *testData = (struct LatencyTestData *)param; uint32_t *A = testData->arr; uint32_t iterations = testData->iterations; uint32_t sum = 0, current; // fucking affinity setting does not work int rc = sched_setaffinity(0, sizeof(cpu_set_t), &(testData->cpuset)); if (rc != 0) fprintf(stderr, "Latency thread failed to set affinity\n"); // Run test gettimeofday(&startTv, &startTz); current = A[0]; for (int i = 0; i < iterations; i++) { current = A[current]; sum += current; } gettimeofday(&endTv, &endTz); uint64_t time_diff_ms = 1000 * (endTv.tv_sec - startTv.tv_sec) + ((endTv.tv_usec - startTv.tv_usec) / 1000); testData->latency = 1e6 * (float)time_diff_ms / (float)iterations; if (sum == 0) printf("sum == 0 (?)\n"); } void *FillBandwidthTestArr(void *param) { struct BandwidthTestThreadData *bwTestData = (struct BandwidthTestThreadData *)param; float *arr = (float *)bwTestData->arr; uint64_t float_elements = bwTestData->arr_length_bytes / 4; for (int i = 0; i < float_elements;i++) { arr[i] = (i + ((uint64_t)arr & 0x3)) + 0.2f; } } void *ReadBandwidthTestThread(void *param) { struct BandwidthTestThreadData *bwTestData = (struct BandwidthTestThreadData *)param; int rc = sched_setaffinity(0, sizeof(cpu_set_t), &(bwTestData->cpuset)); if (rc != 0) { fprintf(stderr, "BW test thread failed to 
/*
 * asm_add: read-modify-write bandwidth kernel (Microsoft x64 calling convention)
 *   rcx = ptr to array
 *   rdx = arr length in bytes
 *   r8  = stop flag (the loop ends once the pointed-to int is non-zero)
 *   r9d = throttle factor (spin count inserted after every 128B chunk)
 *   returns bytes transferred in rax, each RMW counted as 2x its size
 *
 * Every 16B piece of a 128B chunk gets the array's first 16 bytes added to it
 * and is written back in place. addps takes a memory operand, so the array
 * must be 16-byte aligned.
 */
asm_add:
  push %rdi
  push %rsi
  push %r10
  push %r11
  mov %rcx, %rdi            /* rdi = current position in array */
  xor %rsi, %rsi            /* rsi = byte offset within the current pass */
  xor %rax, %rax            /* rax = bytes processed so far */
  movups (%rdi), %xmm0      /* addend, loaded once before the loop */
asm_add_pass_loop:
  /* read-modify-write 128B */
  movups %xmm0, %xmm1
  addps (%rdi), %xmm1
  movups %xmm1, (%rdi)
  movups %xmm0, %xmm1
  addps 16(%rdi), %xmm1
  movups %xmm1, 16(%rdi)
  movups %xmm0, %xmm1
  addps 32(%rdi), %xmm1
  movups %xmm1, 32(%rdi)
  movups %xmm0, %xmm1
  addps 48(%rdi), %xmm1
  movups %xmm1, 48(%rdi)    /* store back to the slot that was read */
  movups %xmm0, %xmm1
  addps 64(%rdi), %xmm1
  movups %xmm1, 64(%rdi)
  movups %xmm0, %xmm1
  addps 80(%rdi), %xmm1
  movups %xmm1, 80(%rdi)
  movups %xmm0, %xmm1
  addps 96(%rdi), %xmm1
  movups %xmm1, 96(%rdi)
  movups %xmm0, %xmm1
  addps 112(%rdi), %xmm1
  movups %xmm1, 112(%rdi)
  add $128, %rdi
  add $128, %rsi
  add $128, %rax
  /* throttle factor is a 32-bit int argument: only r9d is defined */
  test %r9d, %r9d
  jz asm_add_throttle_end
  mov %r9d, %r10d
asm_add_throttle:
  dec %r10d
  jnz asm_add_throttle
asm_add_throttle_end:
  /* check stop flag */
  mov (%r8), %r10d
  test %r10d, %r10d
  jnz asm_add_end
  /* keep going while arr length > offset, otherwise restart at the base */
  cmp %rsi, %rdx
  jg asm_add_pass_loop
  mov %rcx, %rdi
  xor %rsi, %rsi
  jmp asm_add_pass_loop
asm_add_end:
  pop %r11
  pop %r10
  pop %rsi
  pop %rdi
  shl $1, %rax /* count rmw as 2 */
  ret
# Builds the loaded memory latency benchmark. Pick the target matching the host
# architecture; each one links the C driver with that architecture's asm kernels.
# -pthread is needed because the C driver creates its worker threads with
# pthread_create (same flag MemoryBandwidth/Makefile uses).

# Target names are architectures, not files.
.PHONY: amd64 aarch64

amd64:
	gcc -O3 -pthread LoadedMemoryLatency.c LoadedMemoryLatency_amd64.s -o loadedlat_amd64 -lm

aarch64:
	gcc -O3 -pthread LoadedMemoryLatency.c LoadedMemoryLatency_arm.s -o loadedlat_aarch64 -lm
# Builds the memory bandwidth benchmark for each supported architecture.
# TARGET is not set in this file; presumably ../Common/arch_detect.mk defines it
# from the host architecture (TODO confirm).
include ../Common/arch_detect.mk

CFLAGS = -pthread -O3
LDFLAGS= -lm

# Default: build the target named by $(TARGET).
all: $(TARGET)

amd64:
	$(CC) $(CFLAGS) MemoryBandwidth.c MemoryBandwidth_x86.s -o MemoryBandwidth_amd64 $(LDFLAGS)

# The -numa variants define NUMA and additionally link against libnuma.
amd64-numa:
	$(CC) $(CFLAGS) -DNUMA MemoryBandwidth.c MemoryBandwidth_x86.s -o MemoryBandwidth_numa_amd64 $(LDFLAGS) -lnuma

aarch64:
	$(CC) $(CFLAGS) MemoryBandwidth.c MemoryBandwidth_arm.s -o MemoryBandwidth_aarch64 $(LDFLAGS)

# Same sources and output as aarch64, but with gcc and the flags spelled out
# instead of taken from $(CC)/$(CFLAGS)/$(LDFLAGS).
termux:
	gcc -O3 -pthread MemoryBandwidth.c MemoryBandwidth_arm.s -o MemoryBandwidth_aarch64 -lm

aarch64-numa:
	$(CC) $(CFLAGS) -DNUMA MemoryBandwidth.c MemoryBandwidth_arm.s -o MemoryBandwidth_numa_aarch64 $(LDFLAGS) -lnuma

riscv64:
	$(CC) $(CFLAGS) -march=rv64gcv0p7 MemoryBandwidth.c MemoryBandwidth_riscv.s -o MemoryBandwidth_riscv64 $(LDFLAGS)

# Builds the x86 sources into a .exe.
w64:
	$(CC) $(CFLAGS) MemoryBandwidth.c MemoryBandwidth_x86.s -o MemoryBandwidth_w64.exe $(LDFLAGS)

ci: amd64 amd64-numa aarch64 w64

# NOTE: besides object files, this deletes EVERY executable regular file found
# under this directory, not only the binaries produced by the targets above.
clean:
	rm -f *.o && find . -type f -executable -delete

.PHONY: all ci clean
// #include #include #include #include #ifdef __MINGW32__ #include #else #include #endif #include #include #include #include #define NUMA_STRIPE 1 #define NUMA_SEQ 2 #define NUMA_CROSSNODE 3 #define NUMA_AUTO 4 #ifdef _WIN64 int default_test_sizes[39] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768, 1024, 1536, 2048, 3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24567, 32768, 65536, 98304, 131072, 262144, 393216, 524288, 1048576, 1572864, 2097152, 3145728 }; #else int default_test_sizes[35] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 512, 600, 768, 1024, 1536, 2048, 3072, 4096, 5120, 6144, 8192, 10240, 12288, 16384, 24567, 32768, 65536, 98304, 131072, 262144, 393216, 524288 }; #endif enum NopType { None, FourByte, EightByte, K8_FourByte, Branch16, TenByte, LEA }; struct BandwidthTestThreadData { uint32_t iterations; uint32_t arr_length; float* arr; float bw; // written to by the thread }; #ifdef _WIN64 uint32_t dataGb = 512; #else uint32_t dataGb = 96; #endif //__int32 dataGb = 32; // array length = number of 4 byte elements float _fastcall scalar_read(void* arr, uint32_t arr_length, uint32_t iterations); #ifdef _WIN64 extern "C" float sse_asm_read(void* arr, uint64_t arr_length, uint64_t iterations); extern "C" float sse_asm_write(void* arr, uint64_t arr_length, uint64_t iterations); extern "C" float sse_asm_ntwrite(void* arr, uint64_t arr_length, uint64_t iterations); extern "C" float sse_asm_copy(void* arr, uint64_t arr_length, uint64_t iterations); extern "C" float sse_asm_add(void* arr, uint64_t arr_length, uint64_t iterations); extern "C" float avx_asm_read(void* arr, uint64_t arr_length, uint64_t iterations); extern "C" float avx_asm_write(void* arr, uint64_t arr_length, uint64_t iterations); extern "C" float avx_asm_copy(void* arr, uint64_t arr_length, uint64_t iterations); extern "C" float avx_asm_cflip(void* arr, uint64_t arr_length, uint64_t iterations); extern "C" float avx_asm_add(void* arr, uint64_t arr_length, 
uint64_t iterations); extern "C" float avx512_asm_read(void* arr, uint64_t arr_length, uint64_t iterations); extern "C" float repmovsb_copy(void* arr, uint64_t arr_length, uint64_t iterations); extern "C" float repstosb_write(void* arr, uint64_t arr_length, uint64_t iterations); extern "C" float clzero_asm_write(void* arr, uint64_t arr_length, uint64_t iterations); float (*bw_func)(void*, uint64_t, uint64_t) = sse_asm_read; #else extern "C" float __fastcall scalar_asm_read32(void* arr, uint32_t arr_length, uint32_t iterations); extern "C" float __fastcall mmx_asm_read32(void* arr, uint32_t arr_length, uint32_t iterations); extern "C" float __fastcall sse_asm_read32(void* arr, uint32_t arr_length, uint32_t iterations); extern "C" float __fastcall dummy(void* arr, uint32_t arr_length, uint32_t iterations); float(_fastcall *bw_func)(void*, uint32_t, uint32_t) = dummy; #endif float MeasureBw(uint32_t sizeKb, uint32_t iterations, uint32_t threads, int shared, enum NopType instr); float MeasureInstructionBw(uint64_t sizeKb, uint64_t iterations, enum NopType nopSize, uint32_t threads, int shared); void FillInstructionArray(uint64_t* arr, uint64_t sizeKb, enum NopType nopSize); #ifdef _WIN64 float __fastcall instr_read(void* arr, uint64_t arr_length, uint64_t iterations); #else float __fastcall instr_read(void* arr, uint32_t arr_length, uint32_t iterations); #endif void PrintNumaInfo(); uint32_t GetIterationCount(uint32_t testSize, uint32_t threads); DWORD WINAPI ReadBandwidthTestThread(LPVOID param); int numa = 0; char coreNode, memNode; char GetSeqNode(uint64_t); char GetStripeNode(uint64_t); int main(int argc, char *argv[]) { int threads = 1, shared = 0, methodSet = 0; enum NopType instr = None; int cpuid_data[4]; int singleSize = 0; if (argc == 1) { printf("Usage: [-threads ] [-method ] [-shared] [-private] [-data ]\n", dataGb); } for (int argIdx = 1; argIdx < argc; argIdx++) { if (*(argv[argIdx]) == '-') { char* arg = argv[argIdx] + 1; if (_strnicmp(arg, "threads", 7) 
== 0) { argIdx++; threads = atoi(argv[argIdx]); fprintf(stderr, "Using %d threads\n", threads); } else if (_strnicmp(arg, "shared", 6) == 0) { shared = 1; fprintf(stderr, "Using one array shared across all threads\n"); } else if (_strnicmp(arg, "private", 7) == 0) { shared = 0; fprintf(stderr, "Using private array for each thread\n"); } else if (_strnicmp(arg, "method", 6) == 0) { methodSet = 1; argIdx++; #ifdef _WIN64 if (_strnicmp(argv[argIdx], "read_asm_sse", 7) == 0) { bw_func = sse_asm_read; fprintf(stderr, "Using SSE assembly\n"); } else if (_strnicmp(argv[argIdx], "read_asm_avx512", 10) == 0) { bw_func = avx512_asm_read; fprintf(stderr, "Using AVX512 assembly\n"); } else if (_strnicmp(argv[argIdx], "write_asm_avx", 14) == 0) { bw_func = avx_asm_write; fprintf(stderr, "Using AVX assembly, writing instead of reading\n"); } else if (_strnicmp(argv[argIdx], "read_asm_avx", 12) == 0) { bw_func = avx_asm_read; fprintf(stderr, "Using AVX assembly\n"); } else if (_strnicmp(argv[argIdx], "copy_asm_avx", 12) == 0) { bw_func = avx_asm_copy; fprintf(stderr, "Using AVX assembly, copying one half of array to the other\n"); } else if (_strnicmp(argv[argIdx], "cflip_asm_avx", 13) == 0) { bw_func = avx_asm_cflip; fprintf(stderr, "Using AVX assembly, flipping order of vec sized elements within a cacheline\n"); } else if (_strnicmp(argv[argIdx], "add_asm_avx", 11) == 0) { bw_func = avx_asm_add; fprintf(stderr, "Using AVX assembly, adding constant to array\n"); } else if (_strnicmp(argv[argIdx], "copy_asm_sse", 12) == 0) { bw_func = sse_asm_copy; fprintf(stderr, "Using SSE assembly, copying one half of array to the other\n"); } else if (_strnicmp(argv[argIdx], "write_asm_sse", 13) == 0) { bw_func = sse_asm_write; fprintf(stderr, "Using SSE assembly, writing\n"); } else if (_strnicmp(argv[argIdx], "ntwrite_asm_sse", 13) == 0) { bw_func = sse_asm_ntwrite; fprintf(stderr, "Using SSE assembly, non-temporal writes\n"); } else if (_strnicmp(argv[argIdx], "add_asm_sse", 11) == 0) { 
bw_func = sse_asm_add; fprintf(stderr, "Using SSE assembly, adding constant to array\n"); } else if (_strnicmp(argv[argIdx], "copy_repmovsb", 11) == 0) { bw_func = repmovsb_copy; fprintf(stderr, "Using assembly, rep movsb to copy one half of the array to the other\n"); } else if (_strnicmp(argv[argIdx], "write_repstosb", 11) == 0) { bw_func = repstosb_write; fprintf(stderr, "Using assembly, rep stosb to set array contents to 1\n"); } else if (_strnicmp(argv[argIdx], "clzero", 11) == 0) { bw_func = clzero_asm_write; fprintf(stderr, "Using assembly, clzero to set array contents to 0\n"); } #else if (_strnicmp(argv[argIdx], "scalar", 6) == 0) { bw_func = scalar_asm_read32; fprintf(stderr, "Using scalar MOV r <- mem32\n"); } else if (_strnicmp(argv[argIdx], "sse", 3) == 0) { bw_func = sse_asm_read32; fprintf(stderr, "Using SSE MOVAPS xmm <- mem128\n"); } else if (_strnicmp(argv[argIdx], "mmx", 3) == 0) { bw_func = mmx_asm_read32; fprintf(stderr, "Using MMX MOVQ mm <- mem64\n"); } #endif else if (_strnicmp(argv[argIdx], "instr8", 6) == 0) { instr = EightByte; fprintf(stderr, "Using 8B NOPs\n"); } else if (_strnicmp(argv[argIdx], "instr4", 6) == 0) { instr = FourByte; fprintf(stderr, "Using 4B NOPs\n"); } else if (_strnicmp(argv[argIdx], "instrk8_4", 6) == 0) { instr = K8_FourByte; fprintf(stderr, "Using 4B NOPs, with encoding recommended in the Athlon optimization manual\n"); } else if (_strnicmp(argv[argIdx], "instr_lea", 6) == 0) { instr = LEA; fprintf(stderr, "Using LEA\n"); } else if (_strnicmp(argv[argIdx], "branch16", 6) == 0) { instr = Branch16; fprintf(stderr, "Using branch per 16B\n"); } else if (_strnicmp(argv[argIdx], "instr10", 7) == 0) { instr = TenByte; fprintf(stderr, "Using 10B NOPs\n"); } else { methodSet = 0; fprintf(stderr, "I'm so confused. 
Gonna use whatever the CPU supports I guess\n"); } } else if (_strnicmp(arg, "data", 4) == 0) { argIdx++; dataGb = atoi(argv[argIdx]); fprintf(stderr, "Base data to transfer: %u\n", dataGb); } else if (_strnicmp(arg, "printnumainfo", 8) == 0) { fprintf(stderr, "Printing NUMA info and exiting\n"); PrintNumaInfo(); return 0; } else if (_strnicmp(arg, "numa", 4) == 0) { argIdx++; fprintf(stderr, "Attempting to be NUMA aware\n"); numa = NUMA_SEQ; if (_strnicmp(argv[argIdx], "stripe", 6) == 0) { numa = NUMA_STRIPE; } else if (_strnicmp(argv[argIdx], "seq", 3) == 0) { numa = NUMA_SEQ; } if (numa == NUMA_SEQ) fprintf(stderr, "Filling NUMA nodes one by one\n"); else if (numa == NUMA_STRIPE) fprintf(stderr, "Striping threads across NUMA nodes\n"); } else if (_strnicmp(arg, "autonuma", 8) == 0) { numa = NUMA_AUTO; } else if (_strnicmp(arg, "crossnode", 9) == 0) { numa = NUMA_CROSSNODE; argIdx++; coreNode = atoi(argv[argIdx]); argIdx++; memNode = atoi(argv[argIdx]); fprintf(stderr, "Testing %d -> %d\n", coreNode, memNode); } else if (_strnicmp(arg, "singlesize", 10) == 0) { argIdx++; singleSize = atoi(argv[argIdx]); fprintf(stderr, "Testing %d KB\n", singleSize); } } } if (!methodSet) { // cpuid_data[0] = eax, [1] = ebx, [2] = ecx, [3] = edx __cpuidex(cpuid_data, 1, 0); #ifdef _WIN64 // EDX bit 25 = SSE if (cpuid_data[3] & (1UL << 25)) { fprintf(stderr, "SSE supported\n"); bw_func = sse_asm_read; } if (cpuid_data[2] & (1UL << 28)) { fprintf(stderr, "AVX supported\n"); bw_func = avx_asm_read; } __cpuidex(cpuid_data, 7, 0); if (cpuid_data[1] & (1UL << 16)) { fprintf(stderr, "AVX512 supported\n"); bw_func = avx512_asm_read; } #else int choice = 0; printf("Pick a method. Choose wisely:\n"); printf("1. SSE movaps xmm <- mem128"); if (cpuid_data[3] & (1UL << 25)) printf(" (looks supported)\n"); else printf(" (looks unsupported)\n"); printf("2. MMX movq mm <- mem64"); if (cpuid_data[3] & (1UL << 23)) printf(" (looks supported)\n"); else printf(" (looks unsupported\n"); printf("3. 
mov gpr <- mem32 (better work)\n"); printf("4. instruction side, 8B NOPs (0F 1F 84 00 00 00 00 00)\n"); printf("5. instruction side, 4B NOPs (0F 1F 40 00)\n"); printf("6. instruction side, 4B NOPs (66 66 66 90)\n"); printf("Your choice: "); scanf_s("%d", &choice); if (choice == 1) bw_func = sse_asm_read32; else if (choice == 2) bw_func = mmx_asm_read32; else if (choice == 3) bw_func = scalar_asm_read32; else if (choice == 4) instr = EightByte; else if (choice == 5) instr = FourByte; else if (choice == 6) instr = K8_FourByte; else { printf("Bye\n"); return 0; } #endif } if (instr) { bw_func = instr_read; } if (singleSize) { float bw = MeasureBw(singleSize, GetIterationCount(singleSize, threads), threads, shared, instr); printf("%d,%f\n", singleSize, bw); } else if (numa == NUMA_AUTO) { ULONG highestNumaNode; if (!GetNumaHighestNodeNumber(&highestNumaNode)) { fprintf(stderr, "Could not get highest NUMA node number: %d\n", GetLastError()); return 0; } for (int coreNode = 0; coreNode <= highestNumaNode; coreNode++) printf(",%d", coreNode); printf("\n"); for (int coreNodeIdx = 0; coreNodeIdx <= highestNumaNode; coreNodeIdx++) { printf("%d", coreNodeIdx); for (int memNodeIdx = 0; memNodeIdx <= highestNumaNode; memNodeIdx++) { ULONGLONG mask; DWORD index; coreNode = coreNodeIdx; memNode = memNodeIdx; numa = NUMA_CROSSNODE; // hacky, oh well float bw = MeasureBw( default_test_sizes[(sizeof(default_test_sizes) / sizeof(int)) - 1], GetIterationCount(default_test_sizes[(sizeof(default_test_sizes) / sizeof(int)) - 1], threads), threads, shared, instr); printf(",%f", bw); } printf("\n"); } } else { printf("Using %d threads\n", threads); for (int i = 0; i < sizeof(default_test_sizes) / sizeof(int); i++) { float bw = MeasureBw(default_test_sizes[i], GetIterationCount(default_test_sizes[i], threads), threads, shared, instr); if (bw > 0) printf("%d,%f\n", default_test_sizes[i], bw); } } return 0; } /// /// Given test size in KB, return a good iteration count /// /// test size in KB 
/// Iterations per thread uint32_t GetIterationCount(uint32_t testSize, uint32_t threads) { uint32_t gbToTransfer = dataGb; if (testSize > 64) gbToTransfer = dataGb / 2; if (testSize > 512) gbToTransfer = dataGb / 4; if (testSize > 8192) gbToTransfer = dataGb / 8; uint32_t iterations = gbToTransfer * 1024 * 1024 / testSize; if (iterations % 2 != 0) iterations += 1; if (iterations < 8) return 8; // set a minimum to reduce noise else return iterations; } float MeasureBw(uint32_t sizeKb, uint32_t iterations, uint32_t threads, int shared, enum NopType instr) { struct timeb start, end; float bw = 0; uint32_t elements = sizeKb * 1024 / sizeof(float); uint32_t private_elements = ceil((double)sizeKb / (double)threads) * 256; DWORD protection_flags = PAGE_EXECUTE_READWRITE; //if (instr != None) protection_flags = PAGE_EXECUTE_READWRITE; if (!shared) elements = private_elements; //fprintf(stderr, "%llu elements per thread\n", elements); if (!shared && sizeKb < threads) { //fprintf(stderr, "Too many threads for this size, continuing\n"); return 0; } // make array and fill it with something float* testArr = NULL; if (shared) { testArr = (float*)VirtualAlloc(NULL, elements * sizeof(float), MEM_COMMIT | MEM_RESERVE, protection_flags); if (testArr == NULL) { fprintf(stderr, "Could not allocate memory\n"); return 0; } if (instr != None) { FillInstructionArray((uint64_t*)testArr, sizeKb, instr); } else { for (uint32_t i = 0; i < elements; i++) { testArr[i] = i + 0.5f; } } } HANDLE* testThreads = (HANDLE*)malloc(threads * sizeof(HANDLE)); DWORD* tids = (DWORD*)malloc(threads * sizeof(DWORD)); struct BandwidthTestThreadData* threadData = (struct BandwidthTestThreadData*)malloc(threads * sizeof(struct BandwidthTestThreadData)); for (uint64_t i = 0; i < threads; i++) { char node; if (shared) { threadData[i].arr = testArr; threadData[i].iterations = iterations; } else { if (!numa) threadData[i].arr = (float*)VirtualAlloc(NULL, elements * sizeof(float), MEM_COMMIT | MEM_RESERVE, 
protection_flags); else if (numa == NUMA_STRIPE) { node = GetStripeNode(i); threadData[i].arr = (float *)VirtualAllocExNuma( GetCurrentProcess(), NULL, elements * sizeof(float), MEM_RESERVE | MEM_COMMIT, protection_flags, node ); } else if (numa == NUMA_SEQ) { node = GetSeqNode(i); threadData[i].arr = (float*)VirtualAllocExNuma( GetCurrentProcess(), NULL, elements * sizeof(float), MEM_RESERVE | MEM_COMMIT, protection_flags, node ); } else if (numa == NUMA_CROSSNODE) { threadData[i].arr = (float*)VirtualAllocExNuma( GetCurrentProcess(), NULL, elements * sizeof(float), MEM_RESERVE | MEM_COMMIT, protection_flags, memNode ); node = memNode; } if (threadData[i].arr == NULL) { fprintf(stderr, "Could not allocate memory for thread %llu\n", i); return 0; } if (instr != None) { FillInstructionArray((uint64_t*)threadData[i].arr, (elements * 4) / 1024, instr); } else { for (uint64_t arr_idx = 0; arr_idx < elements; arr_idx++) { threadData[i].arr[arr_idx] = arr_idx + i + 0.5f; } } threadData[i].iterations = iterations * threads; } threadData[i].arr_length = elements; threadData[i].bw = 0; testThreads[i] = CreateThread(NULL, 0, ReadBandwidthTestThread, threadData + i, CREATE_SUSPENDED, tids + i); // turns out setting affinity makes no difference, and it's easier to set affinity via start /affinity anyway //SetThreadAffinityMask(testThreads[i], 1UL << i); if (numa == NUMA_STRIPE || numa == NUMA_SEQ) { ULONGLONG mask; //fprintf(stderr, "Thread %d pinned to node %d\n", i, node); GetNumaNodeProcessorMask(node, &mask); SetThreadAffinityMask(testThreads[i], mask); } else if (numa == NUMA_CROSSNODE) { ULONGLONG mask; GetNumaNodeProcessorMask(coreNode, &mask); SetThreadAffinityMask(testThreads[i], mask); } } ftime(&start); for (uint32_t i = 0; i < threads; i++) ResumeThread(testThreads[i]); WaitForMultipleObjects((DWORD)threads, testThreads, TRUE, INFINITE); ftime(&end); int64_t time_diff_ms = 1000 * (end.time - start.time) + (end.millitm - start.millitm); double gbTransferred = 
(uint64_t)iterations * sizeof(float) * elements * threads / (double)1e9; bw = 1000 * gbTransferred / (double)time_diff_ms; if (!shared) bw = bw * threads; //printf("%u iterations\n", iterations); //printf("%f GB, %lu ms\n", gbTransferred, time_diff_ms); free(testThreads); if (shared) VirtualFree(testArr, elements * sizeof(float), MEM_RELEASE); free(tids); if (!shared) { for (int i = 0; i < threads; i++) { VirtualFreeEx(GetCurrentProcess(), threadData[i].arr, 0, MEM_RELEASE); } } free(threadData); return bw; } void FillInstructionArray(uint64_t* arr, uint64_t sizeKb, enum NopType nopSize) { char nop8b[8] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }; // zen/piledriver optimization manual uses this pattern char nop4b[8] = { 0x0F, 0x1F, 0x40, 0x00, 0x0F, 0x1F, 0x40, 0x00 }; // athlon64 (K8) optimization manual pattern char k8_nop4b[8] = { 0x66, 0x66, 0x66, 0x90, 0x66, 0x66, 0x66, 0x90 }; char lea[8] = { 0x48, 0x8D, 0x04, 0x4B, 0x66, 0x0F, 0xEF, 0xC0 }; char nop10b[10] = { 0x66, 0x66, 0xf, 0x1f, 0x84, 0, 0, 0, 0, 0 }; uint64_t elements = (sizeKb * 1024 / 8) - 1; // leave room for ret unsigned char* functionEnd = (unsigned char*)(arr + elements); if (nopSize != Branch16 && nopSize != TenByte) { uint64_t* nopPtr; if (nopSize == EightByte) nopPtr = (uint64_t*)(nop8b); else if (nopSize == FourByte) nopPtr = (uint64_t*)(nop4b); else if (nopSize == K8_FourByte) nopPtr = (uint64_t*)(k8_nop4b); else if (nopSize == LEA) nopPtr = (uint64_t*)(lea); else { fprintf(stderr, "%d (enum value) NOP size isn't supported :(\n", nopSize); return; } for (uint64_t nopIdx = 0; nopIdx < elements; nopIdx++) { arr[nopIdx] = *nopPtr; } functionEnd[0] = 0xC3; } else if (nopSize == TenByte) { char* targetArr = (char*)arr; uint64_t targetArrLenBytes = sizeKb * 1024 - 2; // leave room for ret int targetArrIdx; for (targetArrIdx = 0; targetArrIdx + 10 < targetArrLenBytes; targetArrIdx += 10) { memcpy(targetArr + targetArrIdx, nop10b, 10); } targetArr[targetArrIdx] = 0xC3; } else if (nopSize == 
Branch16) { // jump forward 14 bytes char branch16b[8] = { 0xEB, 0x0E, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; char ret8b[8] = { 0xC3, 0, 0, 0, 0, 0, 0, 0 }; uint64_t *branchPtr = (uint64_t*)(branch16b); uint64_t* nopPtr = (uint64_t*)(nop8b); // doesn't really matter, we should never hit this // last iteration must have nopIdx % 2 == 1, so the jump will go to the return statement // i.e. branchElements for loop must be even, so the last iteration is odd uint64_t branchElements = elements % 2 == 0 ? elements : elements - 1; uint64_t nopIdx; for (nopIdx = 0; nopIdx < branchElements; nopIdx++) { arr[nopIdx] = nopIdx % 2 == 0 ? *branchPtr : *nopPtr; } arr[nopIdx] = *(uint64_t*)ret8b; } } #ifdef _WIN64 float __fastcall instr_read(void* arr, uint64_t arr_length, uint64_t iterations) #else float __fastcall instr_read(void* arr, uint32_t arr_length, uint32_t iterations) #endif { void (*nopfunc)(uint64_t); nopfunc = (void(*)(uint64_t))arr; int iterIdx; for (iterIdx = 0; iterIdx < iterations; iterIdx++) nopfunc(iterations); return iterIdx; } float __fastcall scalar_read(void* a, uint32_t arr_length, uint32_t iterations) { float sum = 0; if (16 >= arr_length) return 0; uint32_t iter_idx = 0, i = 0; float s1 = 0, s2 = 1, s3 = 0, s4 = 1, s5 = 0, s6 = 1, s7 = 0, s8 = 1; float* arr = (float*)a; while (iter_idx < iterations) { s1 += arr[i]; s2 *= arr[i + 1]; s3 += arr[i + 2]; s4 *= arr[i + 3]; s5 += arr[i + 4]; s6 *= arr[i + 5]; s7 += arr[i + 6]; s8 *= arr[i + 7]; i += 8; if (i + 7 >= arr_length) i = 0; if (i == 0) iter_idx++; } sum += s1 + s2 + s3 + s4 + s5 + s6 + s7 + s8; return sum; } // return sum of array float sse_read(float* arr, uint64_t arr_length, uint64_t iterations) { float sum = 0; float iterSum = 0; // zero two sums __m128 s1 = _mm_setzero_ps(); __m128 s2 = _mm_setzero_ps(); __m128 s3 = _mm_loadu_ps(arr); __m128 s4 = _mm_loadu_ps(arr); __m128 s5 = _mm_setzero_ps(); __m128 s6 = _mm_setzero_ps(); __m128 s7 = _mm_loadu_ps(arr); __m128 s8 = _mm_loadu_ps(arr); __m128 zero = 
_mm_setzero_ps(); uint64_t iter_idx = 0, i = 0; while (iter_idx < iterations) { __m128 e1 = _mm_loadu_ps(arr + i); __m128 e2 = _mm_loadu_ps(arr + i + 4); __m128 e3 = _mm_loadu_ps(arr + i + 8); __m128 e4 = _mm_loadu_ps(arr + i + 12); __m128 e5 = _mm_loadu_ps(arr + i + 16); __m128 e6 = _mm_loadu_ps(arr + i + 20); __m128 e7 = _mm_loadu_ps(arr + i + 24); __m128 e8 = _mm_loadu_ps(arr + i + 28); s1 = _mm_add_ps(s1, e1); s2 = _mm_add_ps(s2, e2); s3 = _mm_mul_ps(s3, e3); s4 = _mm_mul_ps(s4, e4); s5 = _mm_add_ps(s5, e5); s6 = _mm_add_ps(s6, e6); s7 = _mm_mul_ps(s7, e7); s8 = _mm_mul_ps(s8, e8); i += 32; if (i + 31 >= arr_length) i = 0; if (i == 0) iter_idx++; } iterSum = _mm_cvtss_f32(s1) + _mm_cvtss_f32(s2) + _mm_cvtss_f32(s3) + _mm_cvtss_f32(s4) + _mm_cvtss_f32(s5) + _mm_cvtss_f32(s6) + _mm_cvtss_f32(s7) + _mm_cvtss_f32(s8); sum = iterSum; return sum; } #ifdef _WIN64 float avx_read(float* arr, uint64_t arr_length, uint64_t iterations) { float sum = 0; float iterSum = 0; __m256 s1 = _mm256_setzero_ps(); __m256 s2 = _mm256_loadu_ps(arr); __m256 s3 = _mm256_setzero_ps(); __m256 s4 = _mm256_loadu_ps(arr); __m256 s5 = _mm256_loadu_ps(arr); __m256 s6 = _mm256_loadu_ps(arr); __m256 s7 = _mm256_loadu_ps(arr); __m256 s8 = _mm256_loadu_ps(arr); uint64_t iter_idx = 0, i = 0; while (iter_idx < iterations) { __m256 e1 = _mm256_loadu_ps(arr + i); __m256 e2 = _mm256_loadu_ps(arr + i + 8); __m256 e3 = _mm256_loadu_ps(arr + i + 16); __m256 e4 = _mm256_loadu_ps(arr + i + 24); __m256 e5 = _mm256_loadu_ps(arr + i + 32); __m256 e6 = _mm256_loadu_ps(arr + i + 40); __m256 e7 = _mm256_loadu_ps(arr + i + 48); __m256 e8 = _mm256_loadu_ps(arr + i + 56); s1 = _mm256_add_ps(s1, e1); s2 = _mm256_mul_ps(s2, e2); s3 = _mm256_add_ps(s3, e3); s4 = _mm256_mul_ps(s4, e4); s5 = _mm256_mul_ps(s5, e5); s6 = _mm256_mul_ps(s6, e6); s7 = _mm256_mul_ps(s7, e7); s8 = _mm256_mul_ps(s8, e8); i += 64; if (i + 63 >= arr_length) i = 0; if (i == 0) iter_idx++; } // sink the result somehow iterSum = _mm256_cvtss_f32(s1) + 
_mm256_cvtss_f32(s2) + _mm256_cvtss_f32(s3) + _mm256_cvtss_f32(s4) + _mm256_cvtss_f32(s5) + _mm256_cvtss_f32(s6) + _mm256_cvtss_f32(s7) + _mm256_cvtss_f32(s8); sum = iterSum; return sum; } #endif DWORD WINAPI ReadBandwidthTestThread(LPVOID param) { BandwidthTestThreadData* bwTestData = (BandwidthTestThreadData*)param; float sum = bw_func(bwTestData->arr, bwTestData->arr_length, bwTestData->iterations); if (sum == 0) printf("woohoo\n"); return 0; } void PrintNumaInfo() { ULONG highestNumaNode; DWORD nProcs; SYSTEM_INFO SystemInfo; GetSystemInfo(&SystemInfo); nProcs = SystemInfo.dwNumberOfProcessors; if (!GetNumaHighestNodeNumber(&highestNumaNode)) { fprintf(stderr, "Could not get highest NUMA node number: %d\n", GetLastError()); return; } printf("%d processors, highest NUMA node is %lu\n", nProcs, highestNumaNode); if (highestNumaNode == 0) { return; } for (int procIdx = 0; procIdx < nProcs; procIdx++) { unsigned char node; GetNumaProcessorNode(procIdx, &node); printf("Processor %d is on node %d\n", procIdx, node); } for (char nodeIdx = 0; nodeIdx <= highestNumaNode; nodeIdx++) { ULONGLONG mask; GetNumaNodeProcessorMask(nodeIdx, &mask); printf("Node %d: %llx\n", nodeIdx, mask); } } char GetStripeNode(uint64_t threadIdx) { ULONG highestNumaNode; if (!GetNumaHighestNodeNumber(&highestNumaNode)) { fprintf(stderr, "Could not get highest NUMA node number: %d\n", GetLastError()); return 0; } return threadIdx % highestNumaNode; } char GetSeqNode(uint64_t threadIdx) { SYSTEM_INFO SystemInfo; GetSystemInfo(&SystemInfo); unsigned int clippedThreadIdx = threadIdx % SystemInfo.dwNumberOfProcessors; unsigned char node; GetNumaProcessorNode(clippedThreadIdx, &node); return node; } ================================================ FILE: MemoryBandwidth/MemoryBandwidth/MemoryBandwidth.sln ================================================  Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio Version 17 VisualStudioVersion = 17.6.33815.320 
MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MemoryBandwidth", "MemoryBandwidth.vcxproj", "{E968D202-64A2-43A5-8BBD-D7D010D06564}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MixedMemoryBandwidthTest", "..\MixedMemoryBandwidthTest\MixedMemoryBandwidthTest.vcxproj", "{5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 Debug|x86 = Debug|x86 Release|x64 = Release|x64 Release|x86 = Release|x86 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x64.ActiveCfg = Debug|x64 {E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x64.Build.0 = Debug|x64 {E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x86.ActiveCfg = Debug|Win32 {E968D202-64A2-43A5-8BBD-D7D010D06564}.Debug|x86.Build.0 = Debug|Win32 {E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x64.ActiveCfg = Release|x64 {E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x64.Build.0 = Release|x64 {E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x86.ActiveCfg = Release|Win32 {E968D202-64A2-43A5-8BBD-D7D010D06564}.Release|x86.Build.0 = Release|Win32 {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x64.ActiveCfg = Debug|x64 {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x64.Build.0 = Debug|x64 {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x86.ActiveCfg = Debug|Win32 {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Debug|x86.Build.0 = Debug|Win32 {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x64.ActiveCfg = Release|x64 {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x64.Build.0 = Release|x64 {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x86.ActiveCfg = Release|Win32 {5AB9DDE0-C954-4D2F-AA46-BFA87EC585C4}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = 
{2EA86D6F-CEE0-40A9-B4DD-AC59CCAD358F} EndGlobalSection EndGlobal ================================================ FILE: MemoryBandwidth/MemoryBandwidth/MemoryBandwidth.vcxproj ================================================ Debug Win32 Release Win32 Debug x64 Release x64 16.0 Win32Proj {e968d202-64a2-43a5-8bbd-d7d010d06564} MemoryBandwidth 10.0 Application true v143 Unicode Application false v143 true Unicode Application true v143 Unicode Application false v143 true Unicode true false true false Level3 true WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true Level3 true true true WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true true true Level3 true _DEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true Level3 true true true NDEBUG;_CONSOLE;%(PreprocessorDefinitions) true Console true true true Document nasm -f win64 MemoryBandwidthFunctions.asm Running NASM MemoryBandwidthFunctions.obj nasm -f win64 MemoryBandwidthFunctions.asm Running NASM MemoryBandwidthFunctions.obj Document nasm -f win32 MemoryBandwidthFunctions32.asm Running NASM, targeting 32-bit MemoryBandwidthFunctions32.obj ================================================ FILE: MemoryBandwidth/MemoryBandwidth/MemoryBandwidth.vcxproj.filters ================================================  {4FC737F1-C7A5-4376-A066-2A32D752A2FF} cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx {93995380-89BD-4b04-88EB-625FBE52EBFB} h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms Source Files Source Files Source Files ================================================ FILE: MemoryBandwidth/MemoryBandwidth/MemoryBandwidthFunctions.asm ================================================ section .text bits 64 global sse_asm_read global sse_asm_copy global sse_asm_write global sse_asm_ntwrite global sse_asm_add global avx_asm_read global avx_asm_write global avx_asm_ntwrite 
global avx_asm_copy global avx_asm_cflip global avx_asm_add global avx512_asm_read global clzero_asm_write global repmovsb_copy global repstosb_write ; rcx = float ptr to arr, rdx = fp32 elements in arr, r8 = iterations, r9 = start index ; return something in xmm0 avx_asm_read: push rsi push rdi push rbx push r15 push r14 mov r15, 256 ; load in blocks of 256 bytes sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break xor r9, r9 ; not doing start anymore, too lazy to clean up code mov rsi, r9 ; assume we're passed in an aligned start location O.o xor rbx, rbx lea rdi, [rcx + rsi * 4] mov r14, rdi avx_asm_read_pass_loop: ; xmm0 to 5 are considered volatile vmovaps ymm0, [rdi] vmovaps ymm1, [rdi + 32] vmovaps ymm2, [rdi + 64] vmovaps ymm3, [rdi + 96] vmovaps ymm0, [rdi + 128] vmovaps ymm1, [rdi + 160] vmovaps ymm2, [rdi + 192] vmovaps ymm3, [rdi + 224] add rsi, 64 add rdi, r15 vmovaps ymm0, [rdi] vmovaps ymm1, [rdi + 32] vmovaps ymm2, [rdi + 64] vmovaps ymm3, [rdi + 96] vmovaps ymm0, [rdi + 128] vmovaps ymm1, [rdi + 160] vmovaps ymm2, [rdi + 192] vmovaps ymm3, [rdi + 224] add rsi, 64 add rdi, r15 cmp rdx, rsi jge asm_avx_test_iteration_count mov rsi, rbx lea rdi, [rcx + rsi * 4] ; back to start asm_avx_test_iteration_count: cmp r9, rsi jnz avx_asm_read_pass_loop ; skip iteration decrement if we're not back to start dec r8 jnz avx_asm_read_pass_loop pop r14 pop r15 pop rbx pop rdi pop rsi ret avx_asm_write: push rsi push rdi push rbx push r15 push r14 mov r15, 256 ; load in blocks of 256 bytes sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break xor r9, r9 ; not doing start anymore, too lazy to clean up code mov rsi, r9 ; assume we're passed in an aligned start location O.o xor rbx, rbx lea rdi, [rcx + rsi * 4] mov r14, rdi vmovaps ymm0, [rcx] avx_asm_write_pass_loop: vmovaps [rdi], ymm0 vmovaps [rdi + 32], ymm0 vmovaps [rdi + 64], ymm0 vmovaps [rdi + 96], ymm0 vmovaps [rdi + 128], ymm0 vmovaps [rdi + 160], ymm0 vmovaps [rdi + 192], ymm0 vmovaps [rdi + 224], ymm0 add rsi, 64 add rdi, r15 vmovaps [rdi], ymm0 vmovaps [rdi + 32], ymm0 vmovaps [rdi + 64], ymm0 vmovaps [rdi + 96], ymm0 vmovaps [rdi + 128], ymm0 vmovaps [rdi + 160], ymm0 vmovaps [rdi + 192], ymm0 vmovaps [rdi + 224], ymm0 add rsi, 64 add rdi, r15 cmp rdx, rsi jge asm_avx_write_iteration_count mov rsi, rbx lea rdi, [rcx + rsi * 4] ; back to start asm_avx_write_iteration_count: cmp r9, rsi jnz avx_asm_write_pass_loop ; skip iteration decrement if we're not back to start dec r8 jnz avx_asm_write_pass_loop pop r14 pop r15 pop rbx pop rdi pop rsi ret avx_asm_ntwrite: push rsi push rdi push rbx push r15 push r14 mov r15, 256 ; load in blocks of 256 bytes sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break
    xor r9, r9 ; not doing start anymore, too lazy to clean up code
    mov rsi, r9 ; assume we're passed in an aligned start location O.o
    xor rbx, rbx
    lea rdi, [rcx + rsi * 4]
    mov r14, rdi
    vmovaps ymm0, [rcx]
avx_asm_ntwrite_pass_loop:
    ; two unrolled 256-byte blocks of non-temporal stores per trip (128 fp32 elements)
    vmovntps [rdi], ymm0
    vmovntps [rdi + 32], ymm0
    vmovntps [rdi + 64], ymm0
    vmovntps [rdi + 96], ymm0
    vmovntps [rdi + 128], ymm0
    vmovntps [rdi + 160], ymm0
    vmovntps [rdi + 192], ymm0
    vmovntps [rdi + 224], ymm0
    add rsi, 64
    add rdi, r15
    vmovntps [rdi], ymm0
    vmovntps [rdi + 32], ymm0
    vmovntps [rdi + 64], ymm0
    vmovntps [rdi + 96], ymm0
    vmovntps [rdi + 128], ymm0
    vmovntps [rdi + 160], ymm0
    vmovntps [rdi + 192], ymm0
    vmovntps [rdi + 224], ymm0
    add rsi, 64
    add rdi, r15
    cmp rdx, rsi
    jge asm_avx_ntwrite_iteration_count
    mov rsi, rbx
    lea rdi, [rcx + rsi * 4] ; back to start
asm_avx_ntwrite_iteration_count:
    cmp r9, rsi
    jnz avx_asm_ntwrite_pass_loop ; skip iteration decrement if we're not back to start
    dec r8
    jnz avx_asm_ntwrite_pass_loop
    pop r14
    pop r15
    pop rbx
    pop rdi
    pop rsi
    ret

; Copies the first half of the array into the second half, 256 bytes per loop trip.
; rcx = ptr to arr
; rdx = arr_length (fp32 elements)
; r8 = iterations (number of full passes over the first half)
; The loop uses ymm0-ymm7. xmm6 and xmm7 are callee-saved in the Microsoft x64
; calling convention, so their low 128 bits are spilled to the stack around the loop.
avx_asm_copy:
    push rsi
    push rdi
    push rbx
    push r15
    push r14
    push r13
    sub rsp, 32 ; scratch space for the two callee-saved vector registers
    vmovups [rsp], xmm6 ; unaligned form: rsp is only guaranteed 8-byte aligned here
    vmovups [rsp + 16], xmm7
    xor rsi, rsi
    mov r9, rdx
    shr r9, 1 ; start destination at array + length / 2
    mov r15, 256 ; load in blocks of 256 bytes
    mov r13, r9
    sub r13, 64 ; last valid element index for the start of a 64-element block
    lea rdi, [rcx + rsi * 4]
    lea r14, [rcx + r9 * 4]
avx_copy_pass_loop:
    vmovaps ymm0, [rdi]
    vmovaps ymm1, [rdi + 32]
    vmovaps ymm2, [rdi + 64]
    vmovaps ymm3, [rdi + 96]
    vmovaps ymm4, [rdi + 128]
    vmovaps ymm5, [rdi + 160]
    vmovaps ymm6, [rdi + 192]
    vmovaps ymm7, [rdi + 224]
    vmovaps [r14], ymm0
    vmovaps [r14 + 32], ymm1
    vmovaps [r14 + 64], ymm2
    vmovaps [r14 + 96], ymm3
    vmovaps [r14 + 128], ymm4
    vmovaps [r14 + 160], ymm5
    vmovaps [r14 + 192], ymm6
    vmovaps [r14 + 224], ymm7
    add rsi, 64
    add rdi, r15 ; increment src/dst pointers
    add r14, r15
    cmp r13, rsi ; end location is at half
    jge avx_copy_pass_loop
    xor rsi, rsi
    lea rdi, [rcx + rsi * 4] ; back to start
    lea r14, [rcx + r9 * 4]
    dec r8 ; decrement iteration counter
    jnz avx_copy_pass_loop
    vmovups xmm6, [rsp] ; restore callee-saved vector registers
    vmovups xmm7, [rsp + 16]
    add rsp, 32
    pop r13
    pop r14
    pop r15
    pop rbx
    pop rdi
    pop rsi
    ret

; changes the ordering of vector sized elements within a cacheline
; Each 128-byte group of four 32-byte vectors is read and written back in reverse order.
; rcx = ptr to arr, rdx = fp32 elements in arr, r8 = iterations
; r8 is decremented by 2 per pass (presumably because every pass both reads and
; writes the array - confirm against the caller's bandwidth calculation).
avx_asm_cflip:
    push rsi
    push rdi
    push rbx
    push r15
    push r14
    mov r15, 256 ; load in blocks of 256 bytes
    sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break. 128 elements per iteration
    xor r9, r9 ; not doing start anymore, too lazy to clean up code
    ; mov rsi, r9 ; assume we're passed in an aligned start location O.o
    xor rsi, rsi
    xor rbx, rbx
    lea rdi, [rcx + rsi * 4]
    mov r14, rdi
avx_asm_cflip_pass_loop:
    vmovaps ymm0, [rdi]
    vmovaps ymm1, [rdi + 32]
    vmovaps ymm2, [rdi + 64]
    vmovaps ymm3, [rdi + 96]
    vmovaps [rdi + 96], ymm0
    vmovaps [rdi + 64], ymm1
    vmovaps [rdi + 32], ymm2
    vmovaps [rdi], ymm3
    vmovaps ymm0, [rdi + 128]
    vmovaps ymm1, [rdi + 160]
    vmovaps ymm2, [rdi + 192]
    vmovaps ymm3, [rdi + 224]
    vmovaps [rdi + 224], ymm0
    vmovaps [rdi + 192], ymm1
    vmovaps [rdi + 160], ymm2
    vmovaps [rdi + 128], ymm3
    add rsi, 64
    add rdi, r15
    vmovaps ymm0, [rdi]
    vmovaps ymm1, [rdi + 32]
    vmovaps ymm2, [rdi + 64]
    vmovaps ymm3, [rdi + 96]
    vmovaps [rdi + 96], ymm0
    vmovaps [rdi + 64], ymm1
    vmovaps [rdi + 32], ymm2
    vmovaps [rdi], ymm3
    vmovaps ymm0, [rdi + 128]
    vmovaps ymm1, [rdi + 160]
    vmovaps ymm2, [rdi + 192]
    vmovaps ymm3, [rdi + 224]
    vmovaps [rdi + 224], ymm0
    vmovaps [rdi + 192], ymm1
    vmovaps [rdi + 160], ymm2
    vmovaps [rdi + 128], ymm3
    add rsi, 64
    add rdi, r15
    cmp rdx, rsi
    jge asm_avx_cflip_iteration_count
    mov rsi, rbx
    lea rdi, [rcx + rsi * 4] ; back to start
asm_avx_cflip_iteration_count:
    cmp r9, rsi
    jnz avx_asm_cflip_pass_loop ; skip iteration decrement if we're not back to start
    sub r8, 2
    ; jg rather than jnz: with an odd iteration count r8 steps from 1 to -1 and
    ; never equals zero, so jnz would keep looping. sse_asm_add already uses jg.
    jg avx_asm_cflip_pass_loop
    pop r14
    pop r15
    pop rbx
    pop rdi
    pop rsi
    ret

; Adds the array's first 32-byte vector (loaded once into ymm4) to every vector
; in the array, in place.
; rcx = ptr to arr, rdx = fp32 elements in arr, r8 = iterations
; r8 is decremented by 2 per pass, same as avx_asm_cflip.
avx_asm_add:
    push rsi
    push rdi
    push rbx
    push r15
    push r14
    mov r15, 256 ; load in blocks of 256 bytes
    sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break
    xor r9, r9 ; not doing start anymore, too lazy to clean up code
    ; mov rsi, r9 ; assume we're passed in an aligned start location O.o
    xor rsi, rsi
    xor rbx, rbx
    lea rdi, [rcx + rsi * 4]
    mov r14, rdi
    vmovaps ymm4, [rdi]
avx_asm_add_pass_loop:
    ; xmm0 to 5 are considered volatile
    vaddps ymm0, ymm4, [rdi]
    vaddps ymm1, ymm4, [rdi + 32]
    vaddps ymm2, ymm4, [rdi + 64]
    vaddps ymm3, ymm4, [rdi + 96]
    vmovaps [rdi], ymm0
    vmovaps [rdi + 32], ymm1
    vmovaps [rdi + 64], ymm2
    vmovaps [rdi + 96], ymm3
    vaddps ymm0, ymm4, [rdi + 128]
    vaddps ymm1, ymm4, [rdi + 160]
    vaddps ymm2, ymm4, [rdi + 192]
    vaddps ymm3, ymm4, [rdi + 224]
    vmovaps [rdi + 128], ymm0
    vmovaps [rdi + 160], ymm1
    vmovaps [rdi + 192], ymm2
    vmovaps [rdi + 224], ymm3
    add rsi, 64
    add rdi, r15
    vaddps ymm0, ymm4, [rdi]
    vaddps ymm1, ymm4, [rdi + 32]
    vaddps ymm2, ymm4, [rdi + 64]
    vaddps ymm3, ymm4, [rdi + 96]
    vmovaps [rdi], ymm0
    vmovaps [rdi + 32], ymm1
    vmovaps [rdi + 64], ymm2
    vmovaps [rdi + 96], ymm3
    vaddps ymm0, ymm4, [rdi + 128]
    vaddps ymm1, ymm4, [rdi + 160]
    vaddps ymm2, ymm4, [rdi + 192]
    vaddps ymm3, ymm4, [rdi + 224]
    vmovaps [rdi + 128], ymm0
    vmovaps [rdi + 160], ymm1
    vmovaps [rdi + 192], ymm2
    vmovaps [rdi + 224], ymm3
    add rsi, 64
    add rdi, r15
    cmp rdx, rsi
    jge asm_avx_add_iteration_count
    mov rsi, rbx
    lea rdi, [rcx + rsi * 4] ; back to start
asm_avx_add_iteration_count:
    cmp r9, rsi
    jnz avx_asm_add_pass_loop ; skip iteration decrement if we're not back to start
    sub r8, 2
    ; jg rather than jnz so an odd iteration count still terminates (see sse_asm_add)
    jg avx_asm_add_pass_loop
    pop r14
    pop r15
    pop rbx
    pop rdi
    pop rsi
    ret

avx512_asm_read:
    push rsi
    push rdi
    push rbx
    push r15
    push r14
    mov r15, 256 ; load in blocks of 256 bytes
    sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break xor r9, r9 ; not doing start anymore, too lazy to clean up code ; mov rsi, r9 ; assume we're passed in an aligned start location O.o xor rsi, rsi xor rbx, rbx lea rdi, [rcx + rsi * 4] mov r14, rdi avx512_asm_read_pass_loop: vmovaps zmm0, [rdi] vmovaps zmm1, [rdi + 64] vmovaps zmm2, [rdi + 128] vmovaps zmm3, [rdi + 192] add rsi, 64 add rdi, r15 vmovaps zmm0, [rdi] vmovaps zmm1, [rdi + 64] vmovaps zmm2, [rdi + 128] vmovaps zmm3, [rdi + 192] add rsi, 64 add rdi, r15 cmp rdx, rsi jge asm_avx512_test_iteration_count mov rsi, rbx lea rdi, [rcx + rsi * 4] ; back to start asm_avx512_test_iteration_count: cmp r9, rsi jnz avx512_asm_read_pass_loop ; skip iteration decrement if we're not back to start dec r8 jnz avx512_asm_read_pass_loop pop r14 pop r15 pop rbx pop rdi pop rsi ret clzero_asm_write: push rsi push rdi push rbx push r15 push r14 mov r15, 256 ; load in blocks of 256 bytes sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break xor r9, r9 ; not doing start anymore, too lazy to clean up code ; mov rsi, r9 ; assume we're passed in an aligned start location O.o xor rsi, rsi xor rbx, rbx lea rdi, [rcx + rsi * 4] mov r14, rdi clzero_asm_write_pass_loop: mov rax, rdi clzero add rax, 64 clzero add rax, 64 clzero add rax, 64 clzero add rsi, 64 add rdi, r15 mov rax, rdi clzero add rax, 64 clzero add rax, 64 clzero add rax, 64 clzero add rsi, 64 add rdi, r15 cmp rdx, rsi jge clzero_asm_write_iteration_count mov rsi, rbx lea rdi, [rcx + rsi * 4] ; back to start clzero_asm_write_iteration_count: cmp r9, rsi jnz clzero_asm_write_pass_loop ; skip iteration decrement if we're not back to start dec r8 sfence jnz clzero_asm_write_pass_loop mov rax, 1 pop r14 pop r15 pop rbx pop rdi pop rsi ret sse_asm_read: push rsi push rdi push rbx push r15 push r14 mov r15, 256 ; load in blocks of 256 bytes sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break xor r9, r9 xor rsi, rsi xor rbx, rbx lea rdi, [rcx + rsi * 4] mov r14, rdi sse_read_pass_loop: ; xmm0 to 5 are considered volatile movaps xmm0, [rdi] movaps xmm1, [rdi + 16] movaps xmm2, [rdi + 32] movaps xmm3, [rdi + 48] movaps xmm0, [rdi + 64] movaps xmm1, [rdi + 80] movaps xmm2, [rdi + 96] movaps xmm3, [rdi + 112] movaps xmm0, [rdi + 128] movaps xmm1, [rdi + 144] movaps xmm2, [rdi + 160] movaps xmm3, [rdi + 176] movaps xmm0, [rdi + 192] movaps xmm2, [rdi + 208] movaps xmm2, [rdi + 224] movaps xmm2, [rdi + 240] add rsi, 64 add rdi, r15 movaps xmm0, [rdi] movaps xmm1, [rdi + 16] movaps xmm2, [rdi + 32] movaps xmm3, [rdi + 48] movaps xmm0, [rdi + 64] movaps xmm1, [rdi + 80] movaps xmm2, [rdi + 96] movaps xmm3, [rdi + 112] movaps xmm0, [rdi + 128] movaps xmm1, [rdi + 144] movaps xmm2, [rdi + 160] movaps xmm3, [rdi + 176] movaps xmm0, [rdi + 192] movaps xmm2, [rdi + 208] movaps xmm2, [rdi + 224] movaps xmm2, [rdi + 240] add rsi, 64 add rdi, r15 cmp rdx, rsi jge sse_test_iteration_count mov rsi, rbx lea rdi, [rcx + rsi * 4] ; back to start sse_test_iteration_count: cmp r9, rsi jnz sse_read_pass_loop ; skip iteration decrement if we're not back to start dec r8 jnz sse_read_pass_loop pop r14 pop r15 pop rbx pop rdi pop rsi ret ; rcx = float ptr to arr, rdx = fp32 elements in arr, r8 = iterations sse_asm_write: push rsi push rdi push rbx push r15 push r14 mov r15, 256 ; load in blocks of 256 bytes sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break xor r9, r9 xor rsi, rsi xor rbx, rbx lea rdi, [rcx + rsi * 4] mov r14, rdi movaps xmm0, [rdi] sse_write_pass_loop: movaps [rdi], xmm0 movaps [rdi + 16], xmm0 movaps [rdi + 32], xmm0 movaps [rdi + 48], xmm0 movaps [rdi + 64], xmm0 movaps [rdi + 80], xmm0 movaps [rdi + 96], xmm0 movaps [rdi + 112], xmm0 movaps [rdi + 128], xmm0 movaps [rdi + 144], xmm0 movaps [rdi + 160], xmm0 movaps [rdi + 176], xmm0 movaps [rdi + 192], xmm0 movaps [rdi + 208], xmm0 movaps [rdi + 224], xmm0 movaps [rdi + 240], xmm0 add rsi, 64 add rdi, r15 movaps [rdi], xmm0 movaps [rdi + 16], xmm0 movaps [rdi + 32], xmm0 movaps [rdi + 48], xmm0 movaps [rdi + 64], xmm0 movaps [rdi + 80], xmm0 movaps [rdi + 96], xmm0 movaps [rdi + 112], xmm0 movaps [rdi + 128], xmm0 movaps [rdi + 144], xmm0 movaps [rdi + 160], xmm0 movaps [rdi + 176], xmm0 movaps [rdi + 192], xmm0 movaps [rdi + 208], xmm0 movaps [rdi + 224], xmm0 movaps [rdi + 240], xmm0 add rsi, 64 add rdi, r15 cmp rdx, rsi jge sse_write_iteration_count mov rsi, rbx lea rdi, [rcx + rsi * 4] ; back to start sse_write_iteration_count: cmp r9, rsi jnz sse_write_pass_loop ; skip iteration decrement if we're not back to start dec r8 jg sse_write_pass_loop pop r14 pop r15 pop rbx pop rdi pop rsi ret sse_asm_ntwrite: push rsi push rdi push rbx push r15 push r14 mov r15, 256 ; load in blocks of 256 bytes sub rdx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break
    xor r9, r9
    xor rsi, rsi
    xor rbx, rbx
    lea rdi, [rcx + rsi * 4]
    mov r14, rdi
    movaps xmm0, [rdi]
sse_ntwrite_pass_loop:
    ; two unrolled 256-byte blocks of non-temporal stores per trip (128 fp32 elements)
    movntps [rdi], xmm0
    movntps [rdi + 16], xmm0
    movntps [rdi + 32], xmm0
    movntps [rdi + 48], xmm0
    movntps [rdi + 64], xmm0
    movntps [rdi + 80], xmm0
    movntps [rdi + 96], xmm0
    movntps [rdi + 112], xmm0
    movntps [rdi + 128], xmm0
    movntps [rdi + 144], xmm0
    movntps [rdi + 160], xmm0
    movntps [rdi + 176], xmm0
    movntps [rdi + 192], xmm0
    movntps [rdi + 208], xmm0
    movntps [rdi + 224], xmm0
    movntps [rdi + 240], xmm0
    add rsi, 64
    add rdi, r15
    movntps [rdi], xmm0
    movntps [rdi + 16], xmm0
    movntps [rdi + 32], xmm0
    movntps [rdi + 48], xmm0
    movntps [rdi + 64], xmm0
    movntps [rdi + 80], xmm0
    movntps [rdi + 96], xmm0
    movntps [rdi + 112], xmm0
    movntps [rdi + 128], xmm0
    movntps [rdi + 144], xmm0
    movntps [rdi + 160], xmm0
    movntps [rdi + 176], xmm0
    movntps [rdi + 192], xmm0
    movntps [rdi + 208], xmm0
    movntps [rdi + 224], xmm0
    movntps [rdi + 240], xmm0
    add rsi, 64
    add rdi, r15
    cmp rdx, rsi
    jge sse_ntwrite_iteration_count
    mov rsi, rbx
    lea rdi, [rcx + rsi * 4] ; back to start
sse_ntwrite_iteration_count:
    cmp r9, rsi
    jnz sse_ntwrite_pass_loop ; skip iteration decrement if we're not back to start
    dec r8
    jg sse_ntwrite_pass_loop
    pop r14
    pop r15
    pop rbx
    pop rdi
    pop rsi
    ret

; Copies the first half of the array into the second half, 256 bytes per loop trip.
; rcx = ptr to arr
; rdx = arr_length (fp32 elements)
; r8 = iterations (number of full passes over the first half)
; The loop uses xmm0-xmm7. xmm6 and xmm7 are callee-saved in the Microsoft x64
; calling convention, so they are spilled to the stack around the loop.
sse_asm_copy:
    push rsi
    push rdi
    push rbx
    push r15
    push r14
    push r13
    sub rsp, 32 ; scratch space for the two callee-saved vector registers
    movups [rsp], xmm6 ; unaligned form: rsp is only guaranteed 8-byte aligned here
    movups [rsp + 16], xmm7
    xor rsi, rsi
    mov r9, rdx
    shr r9, 1 ; start destination at array + length / 2
    mov r15, 256 ; load in blocks of 256 bytes
    mov r13, r9
    sub r13, 64 ; last valid element index for the start of a 64-element block
    lea rdi, [rcx + rsi * 4]
    lea r14, [rcx + r9 * 4]
sse_copy_pass_loop:
    movaps xmm0, [rdi]
    movaps xmm1, [rdi + 16]
    movaps xmm2, [rdi + 32]
    movaps xmm3, [rdi + 48]
    movaps xmm4, [rdi + 64]
    movaps xmm5, [rdi + 80]
    movaps xmm6, [rdi + 96]
    movaps xmm7, [rdi + 112]
    movaps [r14], xmm0
    movaps [r14 + 16], xmm1
    movaps [r14 + 32], xmm2
    movaps [r14 + 48], xmm3
    movaps [r14 + 64], xmm4
    movaps [r14 + 80], xmm5
    movaps [r14 + 96], xmm6
    movaps [r14 + 112], xmm7
    movaps xmm0, [rdi + 128]
    movaps xmm1, [rdi + 144]
    movaps xmm2, [rdi + 160]
    movaps xmm3, [rdi + 176]
    movaps xmm4, [rdi + 192]
    movaps xmm5, [rdi + 208]
    movaps xmm6, [rdi + 224]
    movaps xmm7, [rdi + 240]
    movaps [r14 + 128], xmm0
    movaps [r14 + 144], xmm1
    movaps [r14 + 160], xmm2
    movaps [r14 + 176], xmm3
    movaps [r14 + 192], xmm4
    movaps [r14 + 208], xmm5
    movaps [r14 + 224], xmm6
    movaps [r14 + 240], xmm7
    add rsi, 64
    add rdi, r15 ; increment src/dst pointers
    add r14, r15
    cmp r13, rsi ; end location is at half
    jge sse_copy_pass_loop
    xor rsi, rsi
    lea rdi, [rcx + rsi * 4] ; back to start
    lea r14, [rcx + r9 * 4]
    dec r8 ; decrement iteration counter
    jnz sse_copy_pass_loop
    movups xmm6, [rsp] ; restore callee-saved vector registers
    movups xmm7, [rsp + 16]
    add rsp, 32
    pop r13
    pop r14
    pop r15
    pop rbx
    pop rdi
    pop rsi
    ret

; Adds the array's first 16-byte vector (loaded once into xmm5) to every vector
; in the array, in place. Two unrolled 256-byte blocks per loop trip.
; rcx = ptr to arr, rdx = fp32 elements in arr, r8 = iterations
sse_asm_add:
    push rsi
    push rdi
    push rbx
    push r15
    push r14
    mov r15, 256 ; load in blocks of 256 bytes
    sub rdx, 128 ; last iteration: rsi == rdx. rsi > rdx = break
    xor r9, r9
    xor rsi, rsi
    xor rbx, rbx
    lea rdi, [rcx + rsi * 4]
    mov r14, rdi
    movaps xmm5, [rdi]
sse_add_pass_loop:
    movaps xmm0, xmm5
    movaps xmm1, xmm5
    movaps xmm2, xmm5
    movaps xmm3, xmm5
    addps xmm0, [rdi]
    addps xmm1, [rdi + 16]
    addps xmm2, [rdi + 32]
    addps xmm3, [rdi + 48]
    movaps [rdi], xmm0
    movaps [rdi + 16], xmm1
    movaps [rdi + 32], xmm2
    movaps [rdi + 48], xmm3
    movaps xmm0, xmm5
    movaps xmm1, xmm5
    movaps xmm2, xmm5
    movaps xmm3, xmm5
    addps xmm0, [rdi + 64]
    addps xmm1, [rdi + 80]
    addps xmm2, [rdi + 96]
    addps xmm3, [rdi + 112]
    movaps [rdi + 64], xmm0
    movaps [rdi + 80], xmm1
    movaps [rdi + 96], xmm2
    movaps [rdi + 112], xmm3
    movaps xmm0, xmm5
    movaps xmm1, xmm5
    movaps xmm2, xmm5
    movaps xmm3, xmm5
    addps xmm0, [rdi + 128]
    addps xmm1, [rdi + 144]
    addps xmm2, [rdi + 160]
    addps xmm3, [rdi + 176]
    movaps [rdi + 128], xmm0
    movaps [rdi + 144], xmm1
    movaps [rdi + 160], xmm2
    movaps [rdi + 176], xmm3
    movaps xmm0, xmm5
    movaps xmm1, xmm5
    movaps xmm2, xmm5
    movaps xmm3, xmm5
    addps xmm0, [rdi + 192]
    addps xmm1, [rdi + 208]
    addps xmm2, [rdi + 224]
    addps xmm3, [rdi + 240]
    movaps [rdi + 192], xmm0
    movaps [rdi + 208], xmm1
movaps [rdi + 224], xmm2
    movaps [rdi + 240], xmm3
    add rsi, 64
    add rdi, r15
    movaps xmm0, xmm5
    movaps xmm1, xmm5
    movaps xmm2, xmm5
    movaps xmm3, xmm5
    addps xmm0, [rdi]
    addps xmm1, [rdi + 16]
    addps xmm2, [rdi + 32]
    addps xmm3, [rdi + 48]
    movaps [rdi], xmm0
    movaps [rdi + 16], xmm1
    movaps [rdi + 32], xmm2
    movaps [rdi + 48], xmm3
    movaps xmm0, xmm5
    movaps xmm1, xmm5
    movaps xmm2, xmm5
    movaps xmm3, xmm5
    addps xmm0, [rdi + 64]
    addps xmm1, [rdi + 80]
    addps xmm2, [rdi + 96]
    addps xmm3, [rdi + 112]
    movaps [rdi + 64], xmm0
    movaps [rdi + 80], xmm1
    movaps [rdi + 96], xmm2
    movaps [rdi + 112], xmm3
    movaps xmm0, xmm5
    movaps xmm1, xmm5
    movaps xmm2, xmm5
    movaps xmm3, xmm5
    addps xmm0, [rdi + 128]
    addps xmm1, [rdi + 144]
    addps xmm2, [rdi + 160]
    addps xmm3, [rdi + 176]
    movaps [rdi + 128], xmm0
    movaps [rdi + 144], xmm1
    movaps [rdi + 160], xmm2
    movaps [rdi + 176], xmm3
    movaps xmm0, xmm5
    movaps xmm1, xmm5
    movaps xmm2, xmm5
    movaps xmm3, xmm5
    addps xmm0, [rdi + 192]
    addps xmm1, [rdi + 208]
    addps xmm2, [rdi + 224]
    addps xmm3, [rdi + 240]
    movaps [rdi + 192], xmm0
    movaps [rdi + 208], xmm1
    movaps [rdi + 224], xmm2
    movaps [rdi + 240], xmm3
    add rsi, 64
    add rdi, r15
    cmp rdx, rsi
    jge sse_add_iteration_count
    mov rsi, rbx
    lea rdi, [rcx + rsi * 4] ; back to start
sse_add_iteration_count:
    cmp r9, rsi
    jnz sse_add_pass_loop ; skip iteration decrement if we're not back to start
    sub r8, 2
    jg sse_add_pass_loop
    pop r14
    pop r15
    pop rbx
    pop rdi
    pop rsi
    ret

; Copies the first half of the array to the second half with REP MOVSB, once per iteration.
; rcx = float ptr to arr, rdx = fp32 elements in arr, r8 = iterations
; Loads the first array element into xmm0 before returning.
repmovsb_copy:
    push r15
    push r14
    push r13
    push r12
    push rsi
    push rdi
    push rax
    cld
    ; source = rsi, destination = rdi, count (in bytes) = rcx
    mov rsi, rcx ; set source
    shr rdx, 1 ; rdx = number of fp32 elements in half the array
    ; set destination = source + (size / 2). rdx counts 4-byte elements, so it must be
    ; scaled to bytes; adding it unscaled put the destination inside the source region.
    ; avx_asm_copy and sse_asm_copy compute their destination the same scaled way.
    lea rdi, [rcx + rdx * 4]
    mov rcx, rdx ; set count = (size / 2) * (4 bytes per fp32 element)
    shl rcx, 2
    mov r12, rsi ; rep movsb consumes rsi/rdi/rcx, so keep copies to reload each pass
    mov r13, rdi
    mov r14, rcx
repmovsb_copy_pass_loop:
    mov rsi, r12
    mov rdi, r13
    mov rcx, r14
    rep movsb
    dec r8
    jnz repmovsb_copy_pass_loop
    movss xmm0, [r12]
    pop rax
    pop rdi
    pop rsi
    pop r12
    pop r13
    pop r14
    pop r15
    ret

; Fills the whole array with the byte value 1 using REP STOSB, once per iteration.
; rcx = float ptr to arr, rdx = fp32 elements in arr, r8 = iterations
; Loads the first array element into xmm0 before returning.
repstosb_write:
    push r15
    push r14
    push r13
    push r12
    push rsi
    push rdi
    push rax
    cld
    ; source = value in al, destination = rdi, count (in bytes) = rcx
    mov al, 1 ; set source
    mov r13, rcx ; destination = start of arr
    mov r14, rdx
    shl r14, 2 ; count = (nr of FP32 elements) * 4
repstosb_write_pass_loop:
    mov rdi, r13 ; rep stosb consumes rdi/rcx, reload them each pass
    mov rcx, r14
    rep stosb
    dec r8
    jnz repstosb_write_pass_loop
    movss xmm0, [r13]
    pop rax
    pop rdi
    pop rsi
    pop r12
    pop r13
    pop r14
    pop r15
    ret

================================================
FILE: MemoryBandwidth/MemoryBandwidth/MemoryBandwidthFunctions32.asm
================================================
section .text
bits 32

global @sse_asm_read32@12
global sse_asm_read32
global @mmx_asm_read32@12
global mmx_asm_read32
global @scalar_asm_read32@12
global scalar_asm_read32
global @dummy@12

; Does no work: moves the return address up over the 4-byte stack argument and returns.
@dummy@12:
    mov eax, [esp]
    mov [esp + 4], eax
    add esp, 4
    ret

; ecx = ptr to float array
; edx = arr length
; [esp + 4] = iterations, put this into eax
sse_asm_read32:
@sse_asm_read32@12:
    mov eax, [esp + 4]
    push ecx
    push edx
    push esi
    push edi
    sub edx, 128 ; last iteration: rsi == rdx. 
rsi > rdx = break
    xor esi, esi ; index into array = 0
    lea edi, [ecx + esi * 4]
sse_read32_pass_loop:
    ; two unrolled 256-byte blocks of 16-byte loads per trip (128 fp32 elements)
    movaps xmm0, [edi]
    movaps xmm1, [edi + 16]
    movaps xmm2, [edi + 32]
    movaps xmm3, [edi + 48]
    movaps xmm0, [edi + 64]
    movaps xmm1, [edi + 80]
    movaps xmm2, [edi + 96]
    movaps xmm3, [edi + 112]
    movaps xmm0, [edi + 128]
    movaps xmm1, [edi + 144]
    movaps xmm2, [edi + 160]
    movaps xmm3, [edi + 176]
    movaps xmm0, [edi + 192]
    movaps xmm2, [edi + 208]
    movaps xmm2, [edi + 224]
    movaps xmm2, [edi + 240]
    add esi, 64
    add edi, 256
    movaps xmm0, [edi]
    movaps xmm1, [edi + 16]
    movaps xmm2, [edi + 32]
    movaps xmm3, [edi + 48]
    movaps xmm0, [edi + 64]
    movaps xmm1, [edi + 80]
    movaps xmm2, [edi + 96]
    movaps xmm3, [edi + 112]
    movaps xmm0, [edi + 128]
    movaps xmm1, [edi + 144]
    movaps xmm2, [edi + 160]
    movaps xmm3, [edi + 176]
    movaps xmm0, [edi + 192]
    movaps xmm2, [edi + 208]
    movaps xmm2, [edi + 224]
    movaps xmm2, [edi + 240]
    add esi, 64
    add edi, 256
    cmp edx, esi ; bounds check, expects size to be multiple of 64 elements
    jge sse_read32_pass_loop
    ; zero the index, get back to start, decrement iteration count
    xor esi, esi
    lea edi, [ecx + esi * 4]
    dec eax
    jnz sse_read32_pass_loop
    pop edi
    pop esi
    pop edx
    pop ecx
    ; Callee stack cleanup: the @name@12 decoration is the fastcall form, where the
    ; first two arguments arrive in ecx/edx and the callee removes the remaining
    ; 4-byte stack argument. Moving the return address up one slot and dropping
    ; 4 bytes has the same effect as `ret 4` (and clobbers eax).
    ; NOTE(review): unlike mmx_asm_read32 and scalar_asm_read32 this routine does
    ; not execute fld1 before returning - confirm whether its C prototype returns float.
    mov eax, [esp]
    mov [esp + 4], eax
    add esp, 4
    ret

; Reads the array with 8-byte MMX loads, 256 bytes (64 fp32 elements) per loop trip.
; ecx = ptr to float array
; edx = arr length
; [esp + 4] = iterations, put this into eax
; Pushes 1.0 onto the x87 stack (st0) before returning.
mmx_asm_read32:
@mmx_asm_read32@12:
    mov eax, [esp + 4]
    push ecx
    push edx
    push esi
    push edi
    sub edx, 64 ; last iteration: rsi == rdx. rsi > rdx = break
    xor esi, esi ; index into array = 0
    lea edi, [ecx + esi * 4]
mmx_read32_pass_loop:
    movq mm0, [edi]
    movq mm1, [edi + 8]
    movq mm2, [edi + 16]
    movq mm3, [edi + 24]
    movq mm4, [edi + 32]
    movq mm5, [edi + 40]
    movq mm6, [edi + 48]
    movq mm7, [edi + 56]
    movq mm0, [edi + 64]
    movq mm1, [edi + 72]
    movq mm2, [edi + 80]
    movq mm3, [edi + 88]
    movq mm4, [edi + 96]
    movq mm5, [edi + 104]
    movq mm6, [edi + 112]
    movq mm7, [edi + 120]
    movq mm0, [edi + 128]
    movq mm1, [edi + 136]
    movq mm2, [edi + 144]
    movq mm3, [edi + 152]
    movq mm4, [edi + 160]
    movq mm5, [edi + 168]
    movq mm6, [edi + 176]
    movq mm7, [edi + 184]
    movq mm0, [edi + 192]
    movq mm1, [edi + 200]
    movq mm2, [edi + 208]
    movq mm3, [edi + 216]
    movq mm4, [edi + 224]
    movq mm5, [edi + 232]
    movq mm6, [edi + 240]
    movq mm7, [edi + 248]
    add esi, 64
    add edi, 256
    cmp edx, esi ; bounds check, expects size to be multiple of 64 elements
    jge mmx_read32_pass_loop
    ; zero the index, get back to start, decrement iteration count
    xor esi, esi
    lea edi, [ecx + esi * 4]
    dec eax
    jnz mmx_read32_pass_loop
    pop edi
    pop esi
    pop edx
    pop ecx
    ; fastcall callee cleanup of the 4-byte stack argument, equivalent to `ret 4`
    mov eax, [esp]
    mov [esp + 4], eax
    add esp, 4
    ; MMX registers alias the x87 register file and every MMX instruction marks
    ; all x87 tags as in use. Without emms the fld1 below would hit a full x87
    ; stack (stack overflow -> NaN) and the caller's x87 code would inherit the
    ; dirty state.
    emms
    fld1
    ret

; [esp + 4] = iterations
scalar_asm_read32:
@scalar_asm_read32@12:
    push ebx
    push ecx
    push edx
    push esi
    push edi
    sub edx, 32 ; last iteration: rsi == rdx. 
rsi > rdx = break xor esi, esi ; index into array = 0 lea edi, [ecx + esi * 4] scalar_read32_pass_loop: mov eax, [edi] mov ebx, [edi + 4] mov eax, [edi + 8] mov ebx, [edi + 12] mov eax, [edi + 16] mov ebx, [edi + 20] mov eax, [edi + 24] mov ebx, [edi + 28] mov eax, [edi + 32] mov ebx, [edi + 36] mov eax, [edi + 40] mov ebx, [edi + 44] mov eax, [edi + 48] mov ebx, [edi + 52] mov eax, [edi + 56] mov ebx, [edi + 60] mov eax, [edi + 64] mov ebx, [edi + 68] mov eax, [edi + 72] mov ebx, [edi + 76] mov eax, [edi + 80] mov ebx, [edi + 84] mov eax, [edi + 88] mov ebx, [edi + 92] mov eax, [edi + 96] mov ebx, [edi + 100] mov eax, [edi + 104] mov ebx, [edi + 108] mov eax, [edi + 112] mov ebx, [edi + 116] mov eax, [edi + 120] mov ebx, [edi + 124] add esi, 32 add edi, 128 cmp edx, esi ; bounds check, expects size to be multiple of 64 elements jge scalar_read32_pass_loop ; zero the index, get back to start, decrement iteration count xor esi, esi lea edi, [ecx + esi * 4] dec dword [esp + 24] jnz scalar_read32_pass_loop pop edi pop esi pop edx pop ecx pop ebx mov eax, [esp] mov [esp + 4], eax add esp, 4 fld1 ret ================================================ FILE: MemoryBandwidth/MemoryBandwidth.c ================================================ // MemoryBandwidth.c : Version for linux (x86 and ARM) // Mostly the same as the x86-only VS version, but a bit more manual #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #ifndef __MINGW32__ #include #include #include #include #include #include #include #include #include "../Common/perfmon.h" #endif #ifdef NUMA #include #include #endif #ifndef gettid #define gettid() ((pid_t)syscall(SYS_gettid)) #endif #define HUGEPAGE_HACK 1 #undef HUGEPAGE_HACK #pragma GCC diagnostic ignored "-Wattributes" int default_test_sizes[] = { 2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 256, 384, 400, 448, 512, 600, 768, 1024, 1536, 2048, 2560, 3072, 4096, 5120, 6144, 8192, 10240, 12288, 
14336, 15360, 16384, 18432, 20480, 24567, 32768, 40960, 51200, 61440, 65536, 98304, 131072, 262144, 393216, 524288, 1048576, 1572864, 2097152, 3145728 }; typedef struct BandwidthTestThreadData { uint64_t iterations; uint64_t arr_length; uint64_t start; float* arr; float bw; // written to by the thread #ifdef NUMA cpu_set_t cpuset; // if numa set, will set affinity #endif } BandwidthTestThreadData; float MeasureBw(uint64_t sizeKb, uint64_t iterations, uint64_t threads, int shared, int nopBytes, int coreNode, int memNode); #ifdef __x86_64 #include float scalar_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute((ms_abi)); extern float sse_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float sse_write(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float sse_ntwrite(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float avx512_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float avx512_write(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float avx512_copy(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float avx512_add(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float repmovsb_copy(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float repmovsd_copy(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float repstosb_write(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float repstosd_write(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); float 
(*bw_func)(float*, uint64_t, uint64_t, uint64_t start) __attribute__((ms_abi)); #else float scalar_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start); float (*bw_func)(float*, uint64_t, uint64_t, uint64_t start); #endif #ifdef __x86_64 extern float asm_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float asm_write(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float asm_copy(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float asm_cflip(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); extern float asm_add(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) __attribute__((ms_abi)); #else extern float asm_read(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start); extern float asm_write(float* arr, uint64_t arr_length, uint64_t iterations, uint64_t start); extern float asm_copy(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start); extern float asm_cflip(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start); extern float asm_add(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start); #endif #ifdef __aarch64__ extern void flush_icache(void *arr, uint64_t length); #endif #ifdef __x86_64 __attribute((ms_abi)) float instr_read(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) { #else float instr_read(float *arr, uint64_t arr_length, uint64_t iterations, uint64_t start) { #endif void (*nopfunc)(uint64_t) __attribute((ms_abi)) = (__attribute((ms_abi)) void(*)(uint64_t))arr; for (int iterIdx = 0; iterIdx < iterations; iterIdx++) nopfunc(iterations); return 1.1f; } void FillInstructionArray(uint64_t *nops, uint64_t sizeKb, int nopSize, int branchInterval); uint64_t GetIterationCount(uint64_t testSize, uint64_t threads); void 
*ReadBandwidthTestThread(void *param); void *allocate_memory(size_t bytes, unsigned int threadOffset); uint64_t gbToTransfer = 512; int branchInterval = 0; cpu_set_t global_cpuset; int hardaffinity = 0; #ifdef NUMA #define NUMA_STRIPE 1 #define NUMA_SEQ 2 #define NUMA_CROSSNODE 3 #define NUMA_AUTO 4 #define NUMA_DOUBLE_CROSSNODE 5 int numa = 0; #endif int pmon = 0; int main(int argc, char *argv[]) { int threads = 1; int cpuid_data[4]; int shared = 1; int sleepTime = 0; int methodSet = 0, nopBytes = 0, testBankConflict = 0; int testBankConflict128 = 0; int singleSize = 0, autothreads = 0; int testSizeCount = sizeof(default_test_sizes) / sizeof(int); #ifdef __x86_64 int sseSupported = 0, avxSupported = 0, avx512Supported = 0; sseSupported = __builtin_cpu_supports("sse"); if (sseSupported) fprintf(stderr, "SSE supported\n"); avxSupported = __builtin_cpu_supports("avx"); if (avxSupported) fprintf(stderr, "AVX supported\n"); // gcc has no __builtin_cpu_supports for avx512, so check by hand. // eax = 7 -> extended features, bit 16 of ebx = avx512f uint32_t cpuidEax, cpuidEbx, cpuidEcx, cpuidEdx; __cpuid_count(7, 0, cpuidEax, cpuidEbx, cpuidEcx, cpuidEdx); if (cpuidEbx & (1UL << 16)) { fprintf(stderr, "AVX512 supported\n"); avx512Supported = 1; } #endif bw_func = asm_read; for (int argIdx = 1; argIdx < argc; argIdx++) { if (*(argv[argIdx]) == '-') { char *arg = argv[argIdx] + 1; if (strncmp(arg, "threads", 7) == 0) { argIdx++; threads = atoi(argv[argIdx]); fprintf(stderr, "Using %d threads\n", threads); } else if (strncmp(arg, "shared", 6) == 0) { shared = 1; fprintf(stderr, "Using shared array\n"); } else if (strncmp(arg, "hardaffinity", 12) == 0) { hardaffinity = 1; CPU_ZERO(&global_cpuset); CPU_SET(0, &global_cpuset); CPU_SET(1, &global_cpuset); sched_setaffinity(gettid(), sizeof(cpu_set_t), &global_cpuset); fprintf(stderr, "hardaffinity 0,1\n"); } else if (strncmp(arg, "sleep", 5) == 0) { argIdx++; sleepTime = atoi(argv[argIdx]); fprintf(stderr, "Sleeping for %d 
second between tests\n", sleepTime); } else if (strncmp(arg, "private", 7) == 0) { shared = 0; fprintf(stderr, "Using private array for each thread\n"); } else if (strncmp(arg, "branchinterval", 14) == 0) { argIdx++; branchInterval = atoi(argv[argIdx]); fprintf(stderr, "Will add a branch roughly every %d bytes\n", branchInterval * 8); } else if (strncmp(arg, "sizekb", 6) == 0) { argIdx++; singleSize = atoi(argv[argIdx]); fprintf(stderr, "Testing %d KB\n", singleSize); } else if (strncmp(arg, "data", 4) == 0) { argIdx++; gbToTransfer = atoi(argv[argIdx]); fprintf(stderr, "Base GB to transfer: %lu\n", gbToTransfer); } else if (strncmp(arg, "autothreads", 11) == 0) { argIdx++; autothreads = atoi(argv[argIdx]); fprintf(stderr, "Testing bw scaling up to %d threads\n", autothreads); } #ifndef __MINGW32__ else if (strncmp(arg, "pmon", 4) == 0) { pmon = 1; fprintf(stderr, "Using hardware performance monitoring\n"); } #endif #ifdef NUMA else if (strncmp(arg, "numa", 4) == 0) { argIdx++; fprintf(stderr, "Attempting to be NUMA aware\n"); if (strncmp(argv[argIdx], "crossnode", 4) == 0) { fprintf(stderr, "Testing node to node bandwidth, 1 GB test size\n"); numa = NUMA_CROSSNODE; singleSize = 1048576; } else if (strncmp(argv[argIdx], "seq", 3) == 0) { fprintf(stderr, "Filling NUMA nodes one by one\n"); numa = NUMA_SEQ; } else if (strncmp(argv[argIdx], "stripe", 6) == 0) { fprintf(stderr, "Striping threads across NUMA nodes\n"); numa = NUMA_STRIPE; } else if (strncmp(argv[argIdx], "doublecross", 10) == 0) { fprintf(stderr, "Crossnode, with two nodes\n"); numa = NUMA_DOUBLE_CROSSNODE; } } #endif else if (strncmp(arg, "method", 6) == 0) { methodSet = 1; argIdx++; if (strncmp(argv[argIdx], "scalar", 6) == 0) { bw_func = scalar_read; fprintf(stderr, "Using scalar C code\n"); } else if (strncmp(argv[argIdx], "asm", 3) == 0) { bw_func = asm_read; fprintf(stderr, "Using ASM code (AVX or NEON)\n"); } else if (strncmp(argv[argIdx], "write", 5) == 0) { bw_func = asm_write; fprintf(stderr, 
"Using ASM code (AVX or NEON), testing write bw instead of read\n"); #ifdef __x86_64 if (avx512Supported) { fprintf(stderr, "Using AVX-512 because that's supported\n"); bw_func = avx512_write; } #endif } else if (strncmp(argv[argIdx], "copy", 4) == 0) { bw_func = asm_copy; fprintf(stderr, "Using ASM code (AVX or NEON), testing copy bw instead of read\n"); #ifdef __x86_64 if (avx512Supported) { fprintf(stderr, "Using AVX-512 because that's supported\n"); bw_func = avx512_copy; } #endif } else if (strncmp(argv[argIdx], "cflip", 5) == 0) { bw_func = asm_cflip; fprintf(stderr, "Using ASM code (AVX or NEON), flipping order of elements within cacheline\n"); } else if (strncmp(argv[argIdx], "add", 3) == 0) { bw_func = asm_add; fprintf(stderr, "Using ASM code (AVX or NEON), adding constant to array\n"); #ifdef __x86_64 if (avx512Supported) { fprintf(stderr, "Using AVX-512 because that's supported\n"); bw_func = avx512_add; } #endif } else if (strncmp(argv[argIdx], "instr8", 6) == 0) { nopBytes = 8; bw_func = instr_read; fprintf(stderr, "Testing instruction fetch bandwidth with 8 byte instructions.\n"); } else if (strncmp(argv[argIdx], "instr4", 6) == 0) { nopBytes = 4; bw_func = instr_read; fprintf(stderr, "Testing instruction fetch bandwidth with 4 byte instructions.\n"); } else if (strncmp(argv[argIdx], "instr2", 6) == 0) { nopBytes = 2; bw_func = instr_read; fprintf(stderr, "Testing instruction fetch bandwith with 2 byte instructions.\n"); } #ifdef __x86_64 else if (strncmp(argv[argIdx], "instrk8_4", 8) == 0) { nopBytes = 3; bw_func = instr_read; fprintf(stderr, "Testing instruction bandwidth using 4B NOP encoding recommended in the Athlon optimization manual\n"); } else if (strncmp(argv[argIdx], "instr_funcs", 11) == 0) { nopBytes = -1; bw_func = instr_read; fprintf(stderr, "Testing instruction bandwidth with call to function/return blocks\n"); } else if (strncmp(argv[argIdx], "avx512", 6) == 0) { bw_func = avx512_read; fprintf(stderr, "Using ASM code, AVX512\n"); } 
else if (strncmp(argv[argIdx], "sse_write", 9) == 0) { bw_func = sse_write; fprintf(stderr, "Using SSE to test write bandwidth\n"); } else if (strncmp(argv[argIdx], "sse_ntwrite", 11) == 0) { bw_func = sse_ntwrite; fprintf(stderr, "Using SSE NT writes to test write bandwidth\n"); } else if (strncmp(argv[argIdx], "sse", 3) == 0) { bw_func = sse_read; fprintf(stderr, "Using ASM code, SSE\n"); } else if (strncmp(argv[argIdx], "avx", 3) == 0) { bw_func = asm_read; fprintf(stderr, "Using ASM code, AVX\n"); } else if (strncmp(argv[argIdx], "repmovsb", 8) == 0) { bw_func = repmovsb_copy; fprintf(stderr, "Using REP MOVSB to copy\n"); } else if (strncmp(argv[argIdx], "repmovsd", 8) == 0) { bw_func = repmovsd_copy; fprintf(stderr, "Using REP MOVSD to copy\n"); } else if (strncmp(argv[argIdx], "repstosb", 9) == 0) { bw_func = repstosb_write; fprintf(stderr, "Using REP STOSB to write\n"); } else if (strncmp(argv[argIdx], "repstosd", 9) == 0) { bw_func = repstosd_write; fprintf(stderr, "Using REP STOSD to write\n"); } #endif } } else { fprintf(stderr, "Expected - parameter\n"); fprintf(stderr, "Usage: [-threads ] [-private] [-method ] [-sleep